@Article{ ehrmann_vers_2006,
	title = "Vers une double annotation des Entit{\'e}s Nomm{\'e}es",
	volume = "47",
	url = "http://www.atala.org/Vers-une-double-annotation-des",
	number = "3/2006",
	journal = "Traitement Automatique des Langues",
	author = "Maud Ehrmann and Guillaume Jacquet",
	year = "2006",
	note = "{EHR} 06",
	pages = "63----88"
}

@Article{ jansen_how_2006,
	title = "How are we searching the World Wide Web? A comparison of nine search engine transaction logs",
	volume = "42",
	shorttitle = "How are we searching the World Wide Web?",
	url = "http://portal.acm.org/citation.cfm?id=1710819",
	abstract = "The Web and especially major Web search engines are essential tools in the quest to locate online information for many people. This paper reports results from research that examines characteristics and changes in Web searching from nine studies of five Web search engines based in the {US} and Europe. We compare interactions occurring between users and Web search engines from the perspectives of session length, query length, query complexity, and content viewed among the Web search engines. The results of our research shows (1) users are viewing fewer result pages, (2) searchers on {US-based} Web search engines use more query operators than searchers on European-based search engines, (3) there are statistically significant differences in the use of Boolean operators and result pages viewed, and (4) one cannot necessary apply results from studies of one particular Web search engine to another Web search engine. The wide spread use of Web search engines, employment of simple queries, and decreased viewing of result pages may have resulted from algorithmic enhancements by Web search engine companies. We discuss the implications of the findings for the development of Web search engines and design of online content.",
	number = "1",
	journal = "Inf. Process. Manage.",
	author = "Bernard J. Jansen and Amanda Spink",
	year = "2006",
	note = "{JAN} 06",
	keywords = "transaction log analysis, web search engines, web searching",
	pages = "248--263"
}

@Misc{ kraif_les_2004,
	address = "Caen, France",
	title = "Les corpus multilingues : constitution et applications",
	abstract = "I. {L'Alignement} phrastique Principe et d{\'e}finitions Les indices d'alignement Fonctionnement de l'aligneur Alin{\'e}a Travaux pratiques : points d'ancrage et alignement {II.} {L'Alignement} lexical Probl{\`e}mes de segmentation et degr{\'e}s d'{\'e}quivalence Les indices pour l'extraction de correspondances lexicales Travaux pratiques : correspondances et concordances",
	author = "Olivier Kraif",
	year = "2004"
}

@Misc{ zweigenbaum_corpus_2006,
	address = "Paris",
	title = "Corpus parall{\`e}les et comparables : introduction",
	url = "http://www.limsi.fr/~pz/p11m2r-2006/corpus-paralleles.pdf",
	author = "Pierre Zweigenbaum",
	year = "2006",
	keywords = "corpus comparables, corpus parall{\`e}les, R{\'e}troing{\'e}nierie traductique, techniques d'alignement"
}

@PhDThesis{ ehrmann_les_2008,
	title = "Les entit{\'e}s nomm{\'e}es, de la linguistique au {TAL} : Statut th{\'e}orique et m{\'e}thodes de d{\'e}sambiguïsation",
	school = "Universit{\'e} Paris 7 Denis Diderot",
	author = "Maud Ehrmann",
	year = "2008",
	note = "{EHR} 08"
}

@InProceedings{ brixtel_mesure_2009,
	address = "Avignon, France",
	title = "De la mesure de similarit{\'e} de codes sources vers la d{\'e}tection de plagiat : le ``< {Pomp-O-M{\`e}tre} ''>",
	abstract = "L’objectif de notre travail est la d{\'e}tection de documents plagi{\'e}s au sein d’un corpus. L’application pratique premi{\`e}re est de d{\'e}couvrir, parmi les devoirs de programmation rendus par une classe d’{\'e}tudiants en informatique, lesquels ont {\'e}t{\'e} copi{\'e}s. Notre approche utilise un ensemble de m{\'e}thodes de segmentation des documents ainsi que diff{\'e}rentes distances entre les segments obtenus. Elle est endog{\`e}ne et sans {\`a} priori sur les langages de programmation trait{\'e}s. De plus, elle effectue la synth{\`e}se des r{\'e}sultats pour aider le correcteur {\`a} prendre les bonnes d{\'e}cisions. Cet article commence par pr{\'e}senter le cadre travail et nos hypoth{\`e}ses. Nous donnons ensuite le fonctionnement de chaque {\'e}tape de la cha{\{\\^i}}ne de traitement. Enfin, nous montrons exp{\'e}rimentalement comment, dans diff{\'e}rents corpus issus d’{\'e}tudiants, notre application - le {Pomp-O-M{\`e}tre} - permet le d{\'e}pistage de plagiat.",
	booktitle = "7e Manifestation des Jeunes Chercheurs en Sciences et Technologies de {l'Information} et de la Communication, 16-18 novembre",
	author = "Romain Brixtel and Boris Lesner and Guillaume Bagan and Cyril Bazin",
	year = "2009",
	pages = "8"
}

@InProceedings{ vergne_outil_2003,
	address = "{Batz-sur-Mer,} France",
	title = "Un outil d'extraction terminologique endog{\`e}ne et multilingue",
	volume = "2",
	url = "http://users.info.unicaen.fr/~jvergne/",
	abstract = "Dans cet article, nous pr{\'e}sentons un outil d'extraction terminologique ``endog{\`e}ne'' {\`a} partir d'un corpus multilingue. Cet outil est qualifi{\'e} d'endog{\`e}ne car, sans autre ressource que le corpus dont il doit extraire les termes, il calcule les mots vides {\`a} partir de ce corpus pour centrer les termes candidats sur des mots pleins. Il est plac{\'e} dans le cadre d'un syst{\`e}me de constitution automatique de revue de presse {\`a} partir de sites de presse pr{\'e}sents sur l'internet . Il s'agit de r{\'e}pondre {\`a} des questions telles que : ``de qui, de quoi est-il question aujourd'hui dans la presse de tel espace g{\'e}ographique ou linguistique ?''. Le corpus est constitu{\'e} des textes des hyperliens des {``Unes''} des sites de presse de langues inconnues a priori. Il est renouvel{\'e} quotidiennement, et sa taille est d'environ 100 Ko (d{\'e}balis{\'e}). La m{\'e}thode est fond{\'e}e sur l'analyse distributionnelle, et utilise des diff{\'e}rences entre mots contigus : les diff{\'e}rences de longueur et d'effectif.",
	booktitle = "Actes de {TALN,} 11 au 14 juin 2003",
	author = "Jacques Vergne",
	year = "2003",
	keywords = "endog{\`e}ne, extraction terminologique, fouille de texte, internet, multilingue",
	pages = "139--148"
}

@Article{ navarro_guided_2001,
	title = "A guided tour to approximate string matching",
	volume = "33",
	url = "http://portal.acm.org/citation.cfm?id=375360.375365\&coll=GUIDE\&dl=GUIDE\&CFID=85648535\&CFTOKEN=64298784",
	doi = "10.1145/375360.375365",
	abstract = "We survey the current techniques to cope with the problem of string matching that allows errors. This is becoming a more and more relevant issue for many fast growing areas such as information retrieval and computational biology. We focus on online searching and mostly on edit distance, explaining the problem and its relevance, its statistical behavior, its history and current developments, and the central ideas of the algorithms and their complexities. We present a number of experiments to compare the performance of the different algorithms and show which are the best choices. We conclude with some directions for future work and open problems.",
	number = "1",
	journal = "{ACM} Comput. Surv.",
	author = "Gonzalo Navarro",
	year = "2001",
	keywords = "edit distance, levenshtein distance, online string matching, text searching allowing errors",
	pages = "31--88"
}
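
As a concrete companion to this survey: the edit distance it centres on is usually introduced through the textbook dynamic-programming recurrence sketched below in Python. This is a minimal illustration only; the online, filtering and bit-parallel algorithms Navarro reviews are considerably more elaborate.

def levenshtein(a: str, b: str) -> int:
    # prev[j] holds the edit distance between the current prefix of a and b[:j]
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                  # deletion
                           cur[j - 1] + 1,               # insertion
                           prev[j - 1] + (ca != cb)))    # substitution (free on match)
        prev = cur
    return prev[-1]

# e.g. levenshtein("chien", "chienne") == 2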

@Article{ och_alignment_2004,
	title = "The Alignment Template Approach to Statistical Machine Translation",
	volume = "30",
	url = "http://portal.acm.org/citation.cfm?id=1105587.1105589\&coll=Portal\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "A phrase-based statistical machine translation approach — the alignment template approach — is described. This translation approach allows for general many-to-many relations between words. Thereby, the context of words is taken into account in the translation model, and local changes in word order from source to target language can be learned explicitly. The model is described using a log-linear modeling approach, which is a generalization of the often used source--channel approach. Thereby, the model is easier to extend than classical statistical machine translation systems. We describe in detail the process for learning phrasal translations, the feature functions used, and the search algorithm. The evaluation of this approach is performed on three different tasks. For the {German--English} speech {VERBMOBIL} task, we analyze the effect of various system components. On the {French--English} Canadian {HANSARDS} task, the alignment template system obtains significantly better results than a single-word-based translation model. In the {Chinese--English} 2002 National Institute of Standards and Technology {(NIST)} machine translation evaluation it yields statistically significantly better {NIST} scores than all competing research and commercial translation systems.",
	number = "4",
	journal = "Comput. Linguist.",
	author = "Franz Josef Och and Hermann Ney",
	year = "2004",
	pages = "417--449"
}

@InProceedings{ all_lambiguite_2009,
	address = "La D{\'e}fense, Paris",
	title = "L’ambiguït{\'e} dans les sciences du langage",
	booktitle = "Coldoc09",
	author = "All",
	month = jun,
	year = "2009",
	keywords = "d{\'e}sambiguïsation par le contexte, kanji, types d'ambiguït{\'e}"
}

@PhDThesis{ giguet_methode_1998,
	address = "Caen",
	title = "M{\'e}thode pour l'analyse automatique de structures formelles sur documents multilingues",
	url = "http://www.google.fr/search?q=users.info.unicaen.fr%2F~giguet%2Fthese%2F\&ie=utf-8\&oe=utf-8\&aq=t\&rls=org.mozilla:fr:official\&client=firefox-a",
	abstract = "Cette th{\`e}se traite de l'analyse automatique de structures formelles de l'{\'e}crit. Elle commence par une excursion dans le multilinguisme au cours de laquelle nous pr{\'e}sentons les documents dans leur dimension multilingue et montrons la n{\'e}cessit{\'e} de les traiter comme tels. Nous {\'e}tudions leur structure multilingue et d{\'e}veloppons comment la calculer {\`a} l'aide d'un identificateur de langues. Nous poursuivons par l'expos{\'e} d'une m{\'e}thode originale d'analyse syntaxique automatique d'{\'e}nonc{\'e}s fran\c{c}ais tout-venants. Cette m{\'e}thode est issue de nos travaux de g{\'e}n{\'e}ralisation et d'abstraction des recherches de Jacques Vergne. Les structures syntaxiques auxquelles nous nous sommes particuli{\`e}rement int{\'e}ress{\'e} sont le syntagme minimal et la proposition ; deux unit{\'e}s auxquelles il est possible d'associer une d{\'e}finition ayant une validit{\'e} multilingue, ce qui rend la m{\'e}thode applicable {\`a} diverses langues. Nous proposons deux processus permettant la construction de ces unit{\'e}s. Ces processus consid{\`e}rent les {\'e}nonc{\'e}s comme des flux textuels et construisent chacun leurs structures syntaxiques par propagation de contraintes relationnelles. Les structures intra-syntagmatique et intra-propositionnelle {\'e}tant d{\'e}pendantes, elles sont construites par l'interaction des deux processus, le second processus acceptant de travailler sur des unit{\'e}s partiellement d{\'e}finies. Enfin, nous montrons que les deux processus sont identiques si l'on fait abstraction de la nature de l'unit{\'e} qu'ils construisent et de la base de r{\`e}gles qu'ils manipulent. Le fil conducteur de cette th{\`e}se est la m{\'e}thode. {\`A} chaque calcul de structure, nous mettons en effet l'accent sur la m{\'e}thode ayant permis son obtention. Nous montrons que cette m{\'e}thode est unique. Chaque structure est en effet calcul{\'e}e {\`a} partir d'indices formels et positionnels {\`a} la fois internes et externes : internes par l'{\'e}tude des unit{\'e}s qui composent la structure, externes par l'{\'e}tude du r{\{\\^o}}le de cette structure dans l'unit{\'e} qui l'int{\`e}gre.",
	school = "Universit{\'e} de {Caen/Basse-Normandie}",
	author = "Emmanuel Giguet",
	year = "1998"
}

@Article{ berners-lee_semantic_2001,
	title = "The semantic web",
	volume = "284",
	number = "5",
	journal = "Scientific American",
	author = "Tim {Berners-Lee} and James Hendler and Ora Lassila",
	year = "2001",
	note = "{BER} 01",
	pages = "34--43"
}

@InProceedings{ mimno_polylingual_2009,
	address = "Singapore",
	title = "Polylingual topic models",
	isbn = "978-1-932432-62-6",
	url = "http://portal.acm.org/citation.cfm?id=1699571.1699627\&coll=GUIDE\&dl=GUIDE\&CFID=78588730\&CFTOKEN=70290720",
	abstract = "Topic models are a useful tool for analyzing large text collections, but have previously been applied in only monolingual, or at most bilingual, contexts. Meanwhile, massive collections of interlinked documents in dozens of languages, such as Wikipedia, are now widely available, calling for tools that can characterize content in many languages. We introduce a polylingual topic model that discovers topics aligned across multiple languages. We explore the model's characteristics using two large corpora, each with over ten different languages, and demonstrate its usefulness in supporting machine translation and tracking topic trends across languages.",
	booktitle = "Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 2 - Volume 2",
	publisher = "Association for Computational Linguistics",
	author = "David Mimno and Hanna M. Wallach and Jason Naradowsky and David A. Smith and Andrew {McCallum}",
	year = "2009",
	pages = "880--889"
}

@InProceedings{ simard_text-translation_1999,
	title = "{Text-Translation} Alignment: Three Languages Are Better Than Two",
	shorttitle = "{Text-Translation} Alignment",
	url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.6716",
	journal = "{IN} {PROC.} {OF} {EMNLP/VLC}",
	author = "Michel Simard",
	year = "1999",
	pages = "2----11",
	annote = "{{\textless}p{\textgreater}Simard} montre que l'alignement par paire n'est pas optimal, et que l'alignement simultan{\'e} peut am{\'e}liorer le r{\'e}sultat global{\textless}/p{\textgreater} {\textless}p{\textgreater}cf Crego, Max {\&amp;Yvon,} 2009{\textless}/p{\textgreater} {\textless}p{\textgreater}de nombreuses sources, cf Union euro, font intervenir plus de deux langues{\textless}/p{\textgreater}"
}

@InProceedings{ chiao_evaluation_2006,
	address = "{Genoa/Italy}",
	title = "Evaluation of multilingual text alignment systems: the {ARCADE} {II} project",
	shorttitle = "Evaluation of multilingual text alignment systems",
	url = "http://hal.inria.fr/inria-00115670_v1/",
	author = "{Yun-Chuang} Chiao and Olivier Kraif and Dominique Laurent and Thi Minh Huyen Nguyen and Nasredine Semmar and Fran\c{c}ois Stuck and Jean V{\'e}ronis and Wajdi Zaghouani",
	year = "2006",
	note = "This paper describes the {ARCADE} {II} project, concerned with the evaluation of parallel text alignment systems. The {ARCADE} {II} project aims at exploring the techniques of multilingual text alignment through a fine evaluation of the existing techniques and the development of new alignment methods. The evaluation campaign consists of two tracks devoted to the evaluation of alignment at sentence and word level respectively. It differs from {ARCADE} I in the multilingual aspect and the investigation of lexical alignment."
}

@MastersThesis{ varma_identifying_2002,
	address = "Duluth, Minnesota, {U.S.A}",
	title = "Identifying Word Translations in Parallel Corpora Using Measures of Association",
	url = "http://www.d.umn.edu/~tpederse/students.html",
	abstract = "There are increasing amounts of parallel text available online. Such text consists of an original document and its translation into another language. This thesis takes the view that such data is a very rich source of knowledge that can be utilized to learn how languages can be translated from one to the other. In particular, this thesis focuses on developing techniques that can be used to learn which words are translations of each other, simply based on information found in a large sample of parallel text. The methods employed here are measures of association that have been used in a wide range of statistical applications, and have proven very useful in corpus based natural language processing. In this thesis we explore their use in identifying which words are translations of each other. This thesis starts with an examination of one of the earliest of such approaches, known as K-vec {(Fung} and Church, 1994). We offer several improvements to this algorithm that lead to demonstrably better results in two very different domains. We also evaluate a number of measures of association and identify the T-score, the Log-likelihood Ratio, and the Odds Ratio as being particularly effective. Finally, we propose two ensemble techniques for combining different measures of associations and also for combining different formulations of the same measure and show that both lead to improved results.",
	school = "University of Minnesota",
	author = "Nitin Varma",
	month = dec,
	year = "2002",
	keywords = "K-vec Algorithm"
}
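
For orientation, the log-likelihood ratio this thesis singles out is typically computed over a 2x2 contingency table of co-occurrence counts, Dunning-style. A minimal Python sketch follows; the cell names are chosen here for illustration and are not the thesis's notation.

import math

def llr(k11, k12, k21, k22):
    # k11: aligned regions where both words occur; k12/k21: only one occurs;
    # k22: neither occurs. LLR = 2 * (H(cells) - H(row sums) - H(column sums)),
    # where H is the entropy-style term sum(k * log(k / n)) over raw counts.
    def h(*ks):
        n = sum(ks)
        return sum(k * math.log(k / n) for k in ks if k > 0)
    return 2 * (h(k11, k12, k21, k22)
                - h(k11 + k12, k21 + k22)
                - h(k11 + k21, k12 + k22))

# Higher scores suggest a stronger translation association between the two words.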

@InProceedings{ lardilleux_multilingual_2008,
	address = "Manchester : {Royaume-Uni}",
	title = "Multilingual alignments by monolingual string differences",
	url = "http://hal.archives-ouvertes.fr/hal-00368710_v1/",
	author = "Adrien Lardilleux and Yves Lepage",
	month = aug,
	year = "2008",
	note = "We propose a method to obtain subsentential alignments from several languages simultaneously. The method handles several languages at once, and avoids the complexity explosion due to the usual pair-by-pair processing. It can be used for different units (characters, morphemes, words, chunks). An evaluation of word alignments with a trilingual machine translation corpus has been conducted. A comparison of the results with those obtained by state of the art alignment software is reported."
}

@InProceedings{ lardilleux_anymalign_2009,
	address = "Senlis, France",
	title = "anymalign : un outil d’alignement sous-phrastique libre pour les {\{\\^e}}tres humains",
	abstract = "Nous pr{\'e}sentons anymalign, un aligneur sous-phrastique grand public. Ses r{\'e}sultats ont une qualit{\'e} qui rivalise avec le meilleur outil du domaine, {GIZA++.} Il est rapide et simple d’utilisation, et permet de produire dictionnaires et autres tables de traduction en une seule commande. {\`A} notre connaissance, c’est le seul outil au monde permettant d’aligner un nombre quelconque de langues simultan{\'e}ment. Il s’agit donc du premier aligneur sousphrastique r{\'e}ellement multilingue.",
	booktitle = "Actes de la 16{\`e}me conf{\'e}rence annuelle sur le Traitement Automatique des Langues Naturelles",
	author = "Adrien Lardilleux and Yves Lepage",
	year = "2009"
}

@Article{ tasmowski_presentation_2005,
	title = "Pr{\'e}sentation",
	volume = "50",
	issn = "0082-6049",
	url = "http://www.cairn.info/revue-travaux-de-linguistique-2005-1-page-7.htm",
	doi = "10.3917/tl.050.0007",
	number = "1",
	journal = "Travaux de linguistique",
	author = "Liliane Tasmowski and Svetlana Vogeleer",
	year = "2005",
	pages = "7"
}

@InProceedings{ debili_aligning_1992,
	address = "Nantes, France",
	title = "Aligning sentences in bilingual texts: {French-English} and {French-Arabic}",
	shorttitle = "Aligning sentences in bilingual texts",
	url = "http://portal.acm.org/citation.cfm?id=992151\&dl=GUIDE\&coll=GUIDE\&CFID=78336177\&CFTOKEN=78125505",
	abstract = "In this paper, we will tackle the problem raised by the automatic alignment of sentences belonging to bilingual text pairs. The method that we advocate here is inspired by what a person with a fair knowledge of the other langage would do intuitively. It is based on the matching of the elements which are similar in both sentences. However, to match these elements correctly, we first have to match the sentences that contain them. There seems to be a vicious circle here. We will show how to break it. On the one hand, we will describe the hypotheses we made, and, on the other hand, the algorithms which ensued. The experiments are carried out with {French-English} and {French-Arabic} text {pairs.We} will show that matching sentences and, later, expressions, amounts to raising a new problem in the machine translation field, i. e. the problem of recognition instead of that of translation, strictly speaking.",
	booktitle = "Proceedings of the 14th conference on Computational linguistics - Volume 2",
	publisher = "Association for Computational Linguistics",
	author = "Fathi Debili and Ely{\`e}s Sammouda",
	year = "1992",
	pages = "517--524",
	annote = "{\textless}p{\textgreater}circularit{\'e}, cercle vicieux, entre alignement de phrases et alignement de mots{\textless}/p{\textgreater}"
}

@InProceedings{ papineni_bleu:_2002,
	address = "Philadelphia, Pennsylvania",
	title = "{BLEU:} a method for automatic evaluation of machine translation",
	shorttitle = "{BLEU}",
	url = "http://portal.acm.org/citation.cfm?id=1073135",
	abstract = "Human evaluations of machine translation are extensive but expensive. Human evaluations can take months to finish and involve human labor that can not be reused. We propose a method of automatic machine translation evaluation that is quick, inexpensive, and language-independent, that correlates highly with human evaluation, and that has little marginal cost per run. We present this method as an automated understudy to skilled human judges which substitutes for them when there is need for quick or frequent evaluations.",
	booktitle = "Proceedings of the 40th Annual Meeting on Association for Computational Linguistics",
	publisher = "Association for Computational Linguistics",
	author = "Kishore Papineni and Salim Roukos and Todd Ward and {Wei-Jing} Zhu",
	year = "2002",
	pages = "311--318"
}
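
The gist of the metric, for readers who want it operational: clipped n-gram precisions combined geometrically and scaled by a brevity penalty. This toy sentence-level sketch assumes a single reference and a non-empty candidate; the paper's BLEU is computed corpus-wide, over multiple references.

import math
from collections import Counter

def bleu(candidate, reference, max_n=4):
    # candidate, reference: token lists. Returns a toy sentence-level score.
    precisions = []
    for n in range(1, max_n + 1):
        cand = Counter(tuple(candidate[i:i + n]) for i in range(len(candidate) - n + 1))
        ref = Counter(tuple(reference[i:i + n]) for i in range(len(reference) - n + 1))
        clipped = sum(min(c, ref[g]) for g, c in cand.items())   # clip by reference counts
        precisions.append(max(clipped, 1e-9) / max(sum(cand.values()), 1))  # avoid log(0)
    bp = min(1.0, math.exp(1 - len(reference) / len(candidate)))  # brevity penalty
    return bp * math.exp(sum(math.log(p) for p in precisions) / max_n)

# bleu("the cat sat on the mat".split(), "the cat sat on a mat".split())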

@Article{ isabelle_bi-textualite_1992,
	title = "La bi-textualit{\'e} : vers une nouvelle g{\'e}n{\'e}ration d’aides {\`a} la traduction et la terminologie",
	volume = "37",
	issn = "0026-0452",
	shorttitle = "La bi-textualit{\'e}",
	url = "http://id.erudit.org/iderudit/003228ar",
	number = "4",
	journal = "Meta",
	author = "Pierre Isabelle",
	year = "1992",
	keywords = "bi-texte, concordancier bilingue, critique de traduction, faux amis / deceptive cognates, m{\'e}moire d'entreprise, postes de travail du traducteur",
	pages = "721--737"
}

@Article{ och_systematic_2003,
	title = "A systematic comparison of various statistical alignment models",
	volume = "29",
	url = "http://portal.acm.org/citation.cfm?id=778822.778824\&coll=GUIDE\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "We present and compare various methods for computing word alignments using statistical or heuristic models. We consider the five alignment models presented in Brown, Della Pietra, Della Pietra, and Mercer (1993), the hidden Markov alignment model, smoothing techniques, and refinements. These statistical models are compared with two heuristic models based on the Dice coefficient. We present different methods for combining word alignments to perform a symmetrization of directed statistical alignment models. As evaluation criterion, we use the quality of the resulting Viterbi alignment compared to a manually produced reference alignment. We evaluate the models on the {German-English} Verbmobil task and the {French-English} Hansards task. We perform a detailed analysis of various design decisions of our statistical alignment system and evaluate these on training corpora of various sizes. An important result is that refined alignment models with a first-order dependence and a fertility model yield significantly better results than simple heuristic models. In the Appendix, we present an efficient training algorithm for the alignment models presented.",
	number = "1",
	journal = "Comput. Linguist.",
	author = "Franz Josef Och and Hermann Ney",
	year = "2003",
	pages = "19--51"
}
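
One of the heuristic baselines Och and Ney compare against is easy to reproduce: Dice-coefficient association between source and target words over a sentence-aligned corpus. A minimal sketch, for illustration only; the IBM models and the HMM model they evaluate are far richer.

from collections import Counter

def dice_scores(bitext):
    # bitext: iterable of (source_tokens, target_tokens) sentence pairs.
    cs, ct, cst = Counter(), Counter(), Counter()
    for src, tgt in bitext:
        src_set, tgt_set = set(src), set(tgt)
        cs.update(src_set)                                   # sentences containing s
        ct.update(tgt_set)                                   # sentences containing t
        cst.update((s, t) for s in src_set for t in tgt_set)  # co-occurrences
    # dice(s, t) = 2 * C(s, t) / (C(s) + C(t))
    return {pair: 2 * n / (cs[pair[0]] + ct[pair[1]]) for pair, n in cst.items()}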

@InProceedings{ brun_experience_2009,
	address = "Senlis, France",
	title = "Une Exp{\'e}rience de Fusion pour {l'Annotation} {d'Entit{\'e}s} Nomm{\'e}es",
	url = "http://www-lipn.univ-paris13.fr/taln09/pdf/TALN_24.pdf",
	booktitle = "16{\`e}me Conf{\'e}rence sur le Traitement Automatique des Langues Naturelles {(TALN'09)}",
	author = "Caroline Brun and Nicolas Dessaigne and Maud Ehrmann and Baptiste Gaillard and Sylvie {Guillemin-Lanne} and Guillaume Jacquet and Aaron Kaplan and Marianna Kucharski and Claude Martineau and Aur{\'e}lie Migeotte and Takuya Nakamura and Stavroula Voyatzi",
	year = "2009",
	note = "{BRU} 09"
}

@Article{ fung_technical_1997,
	title = "A Technical Word- and {Term-Translation} Aid Using Noisy Parallel Corpora across Language Groups",
	volume = "12",
	url = "http://portal.acm.org/citation.cfm?id=593161",
	abstract = "Technical-term translation represents one of the most difficult tasks for human translators since (1) most translators are not familiar with terms and domain-specific terminology and (2) such terms are not adequately covered by printed dictionaries. This paper describes an algorithm for translating technical words and terms from noisy parallel corpora across language groups. Given any word which is part of a technical term in the source language, the algorithm produces a ranked candidate match for it in the target language. Potential translations for the term are compiled from the matched words and are also ranked. We show how this ranked list helps translators in technical-term translation. Most algorithms for lexical and term translation focus on {Indo-European} language pairs, and most use a sentence-aligned clean parallel corpus without insertion, deletion or {OCR} noise. Our algorithm is language- and character-set-independent, and is robust to noise in the corpus. We show how our algorithm requires minimum preprocessing and is able to obtain technical-word translations without sentence-boundary identification or sentence alignment, from the {English--Japanese} awk manual corpus with noise arising from text insertions or deletions and on the {English--Chinese} {HKUST} bilingual corpus. We obtain a precision of 55.35\% from the awk corpus for word translation including rare words, counting only the best candidate and direct translations. Translation precision of the best-candidate translation is 89.93\% from the {HKUST} corpus. Potential term translations produced by the program help bilingual speakers to get a 47\% improvement in translating technical terms.",
	number = "1/2",
	journal = "Machine Translation",
	author = "Pascale Fung and Kathleen {McKeown}",
	year = "1997",
	keywords = "alignment, chinese, noisy corpora, technical terms, translation aid",
	pages = "53--87"
}

@InProceedings{ brown_aligning_1991,
	address = "Berkeley, California",
	title = "Aligning sentences in parallel corpora",
	url = "http://portal.acm.org/citation.cfm?id=981344.981366\&coll=Portal\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "In this paper we describe a statistical technique for aligning sentences with their translations in two parallel corpora. In addition to certain anchor points that are available in our data, the only information about the sentences that we use for calculating alignments is the number of tokens that they contain. Because we make no use of the lexical details of the sentence, the alignment computation is fast and therefore practical for application to very large collections of text. We have used this technique to align several million sentences in the {English-French} Hansard corpora and have achieved an accuracy in excess of 99\% in a random selected set of 1000 sentence pairs that we checked by hand. We show that even without the benefit of anchor points the correlation between the lengths of aligned sentences is strong enough that we should expect to achieve an accuracy of between 96\% and 97\%. Thus, the technique may be applicable to a wider variety of texts than we have yet tried.",
	booktitle = "Proceedings of the 29th annual meeting on Association for Computational Linguistics",
	publisher = "Association for Computational Linguistics",
	author = "Peter F. Brown and Jennifer C. Lai and Robert L. Mercer",
	year = "1991",
	pages = "169--176",
	annote = "{\textless}p{\textgreater}algo de programmation dynamique d{\'e}pendant de points d'ancrage : les paragraphes.{\textless}/p{\textgreater} {\textless}p{\textgreater} {\textless}/p{\textgreater}"
}
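
The core idea, length-based alignment by dynamic programming over "beads" (1-1, 1-0, 0-1, 2-1, 1-2), can be sketched as below. The cost function here is an ad-hoc stand-in chosen for brevity; Brown et al. score beads with a probabilistic model over token counts rather than this penalty.

def align_by_length(src_lens, tgt_lens):
    # src_lens, tgt_lens: token counts per sentence. Returns bead shapes (di, dj).
    INF = float("inf")
    n, m = len(src_lens), len(tgt_lens)
    cost = [[INF] * (m + 1) for _ in range(n + 1)]
    back = [[None] * (m + 1) for _ in range(n + 1)]
    cost[0][0] = 0.0
    beads = [(1, 1), (1, 0), (0, 1), (2, 1), (1, 2)]
    for i in range(n + 1):
        for j in range(m + 1):
            if cost[i][j] == INF:
                continue
            for di, dj in beads:
                if i + di <= n and j + dj <= m:
                    # penalize length mismatch, plus a flat surcharge for non-1-1 beads
                    mismatch = abs(sum(src_lens[i:i + di]) - sum(tgt_lens[j:j + dj]))
                    penalty = mismatch + (0.0 if (di, dj) == (1, 1) else 3.0)
                    if cost[i][j] + penalty < cost[i + di][j + dj]:
                        cost[i + di][j + dj] = cost[i][j] + penalty
                        back[i + di][j + dj] = (di, dj)
    path, i, j = [], n, m
    while (i, j) != (0, 0):                 # backtrace from the final cell
        di, dj = back[i][j]
        path.append((di, dj))
        i, j = i - di, j - dj
    return path[::-1]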

@InProceedings{ chinchor_muc-7_1997,
	title = "{MUC-7} named entity task definition",
	booktitle = "Proceedings of the 7th Message Understanding Conference",
	author = "Nancy Chinchor",
	year = "1997",
	note = "{CHI} 97"
}

@PhDThesis{ nakamura-delloye_alignement_2007,
	address = "Paris, France",
	title = "Alignement Automatique de Textes Parall{\`e}les {Fran\c{c}ais-Japonais}",
	url = "http://tel.archives-ouvertes.fr/tel-00259276/en/",
	abstract = "L'alignement automatique consiste {\`a} trouver une correspondance entre des unit{\'e}s de textes parall{\`e}les. Nous nous int{\'e}ressons plus particuli{\`e}rement {\`a} la r{\'e}alisation d'un syst{\`e}me qui proc{\`e}de {\`a} l'alignement au niveau des propositions, unit{\'e}s profitables dans beaucoup d'applications. La pr{\'e}sente th{\`e}se est constitu{\'e}e de deux types de travaux : les travaux introducteurs et ceux constituant le noyau central. Ce dernier s'articule autour de la notion de proposition syntaxique. Les travaux introducteurs comprennent l'{\'e}tude des g{\'e}n{\'e}ralit{\'e}s sur l'alignement ainsi que des travaux consacr{\'e}s {\`a} l'alignement des phrases. Ces travaux ont conduit {\`a} la r{\'e}alisation d'un syst{\`e}me d'alignement des phrases adapt{\'e} au traitement des textes fran\c{c}ais et japonais. Le noyau de la th{\`e}se est compos{\'e} de deux types de travaux, {\'e}tudes linguistiques et r{\'e}alisations informatiques. Les {\'e}tudes linguistiques se divisent elles-m{\{\\^e}}mes en deux sujets : la proposition en fran\c{c}ais et la proposition en japonais. Le but de nos {\'e}tudes sur la proposition fran\c{c}aise est de d{\'e}finir une grammaire pour la d{\'e}tection des propositions. Pour cet effet, nous avons cherch{\'e} {\`a} d{\'e}finir une typologie des propositions, bas{\'e}e sur des crit{\`e}res uniquement formels. Dans les {\'e}tudes sur le japonais, nous d{\'e}finissons d'abord la phrase japonaise sur la base de l'opposition th{\`e}me-rh{\`e}me. Nous tentons ensuite d'{\'e}lucider la notion de proposition. Les r{\'e}alisations informatiques comportent trois t{\{\\^a}}ches composant ensemble au final l'op{\'e}ration d'alignement des propositions, incarn{\'e}es par trois syst{\`e}mes informatiques distincts : deux d{\'e}tecteurs de propositions (un pour le fran\c{c}ais et un pour le japonais), ainsi qu'un syst{\`e}me d'alignement des propositions.",
	school = "Universit{\'e} Denis Diderot, Paris {VII}",
	author = "Yayoi {Nakamura-Delloye}",
	year = "2007",
	keywords = "Alignement, analyse morphologique japonaise partielle, analyse syntaxique partielle, appariement de graphes, {CFG, } classification ascendante hi{\'e}rarchique, corpus parall{\`e}les, {DCG, } linguistique contrastive, m{\'e}moire de traduction, Prolog, proposition syntaxique, subordination"
}

@Article{ kaerkkaeinen_linear_2006,
	title = "Linear work suffix array construction",
	volume = "53",
	url = "http://portal.acm.org/citation.cfm?id=1217858",
	doi = "10.1145/1217856.1217858",
	abstract = "Suffix trees and suffix arrays are widely used and largely interchangeable index structures on strings and sequences. Practitioners prefer suffix arrays due to their simplicity and space efficiency while theoreticians use suffix trees due to linear-time construction algorithms and more explicit structure. We narrow this gap between theory and practice with a simple linear-time construction algorithm for suffix arrays. The simplicity is demonstrated with a C\&plus;\&plus; implementation of 50 effective lines of code. The algorithm is called {DC3,} which stems from the central underlying concept of difference cover. This view leads to a generalized algorithm, {DC,} that allows a space-efficient implementation and, moreover, supports the choice of a space--time tradeoff. For any v ∈ \&lsqb;1,\&nradic;\&rsqb;, it runs in O(vn) time using O(n/\&vradic;) space in addition to the input string and the suffix array. We also present variants of the algorithm for several parallel and hierarchical memory models of computation. The algorithms for {BSP} and {EREW-PRAM} models are asymptotically faster than all previous suffix tree or array construction algorithms.",
	number = "6",
	journal = "J. {ACM}",
	author = "Juha K{\"a}rkk{\"a}inen and Peter Sanders and Stefan Burkhardt",
	year = "2006",
	keywords = "difference cover, external memory algorithms, suffix array",
	pages = "918--936"
}
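
For readers new to the data structure itself: a suffix array is just the lexicographically sorted list of suffix start positions. The naive construction below shows the object DC3 builds; it costs O(n^2 log n) in the worst case, whereas the paper's contribution is building it in linear time.

def suffix_array(s: str):
    # Sort suffix start positions by the suffixes they begin.
    return sorted(range(len(s)), key=lambda i: s[i:])

# suffix_array("banana") == [5, 3, 1, 0, 4, 2]
#   suffixes in sorted order: "a", "ana", "anana", "banana", "na", "nana"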

@InProceedings{ melamed_portable_1997,
	address = "Madrid, Spain",
	title = "A portable algorithm for mapping bitext correspondence",
	url = "http://portal.acm.org/citation.cfm?id=976909.979656\&coll=GUIDE\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "The first step in most empirical work in multilingual {NLP} is to construct maps of the correspondence between texts and their translations (bitext maps). The Smooth Injective Map Recognizer {(SIMR)} algorithm presented here is a generic pattern recognition algorithm that is particularly well-suited to mapping bitext correspondence. {SIMR} is faster and significantly more accurate than other algorithms in the literature. The algorithm is robust enough to use on noisy texts, such as those resulting from {OCR} input, and on translations that are not very literal. {SIMR} encapsulates its language-specific heuristics, so that it can be ported to any language pair with a minimal effort.",
	booktitle = "Proceedings of the 35th Annual Meeting of the Association for Computational Linguistics and Eighth Conference of the European Chapter of the Association for Computational Linguistics",
	publisher = "Association for Computational Linguistics",
	author = "I. Dan Melamed",
	year = "1997",
	pages = "305--312"
}

@Book{ jonasson_nom_1994,
	title = "Le nom propre",
	isbn = "2801110779",
	publisher = "Duculot - De Boeck",
	author = "Kerstin Jonasson",
	month = may,
	year = "1994",
	note = "{JON} 94"
}

@Article{ cochrane_foisonnement_2007,
	title = "Le foisonnement, ph{\'e}nom{\`e}ne complexe",
	volume = "8",
	url = "http://id.erudit.org/iderudit/037222ar",
	number = "2",
	author = "Guylaine Cochrane",
	year = "2007"
}

@Article{ jain_data_1999,
	title = "Data clustering: a review",
	volume = "31",
	shorttitle = "Data clustering",
	url = "http://portal.acm.org/citation.cfm?id=331504",
	doi = "10.1145/331499.331504",
	abstract = "Clustering is the unsupervised classification of patterns (observations, data items, or feature vectors) into groups (clusters). The clustering problem has been addressed in many contexts and by researchers in many disciplines; this reflects its broad appeal and usefulness as one of the steps in exploratory data analysis. However, clustering is a difficult problem combinatorially, and differences in assumptions and contexts in different communities has made the transfer of useful generic concepts and methodologies slow to occur. This paper presents an overview of pattern clustering methods from a statistical pattern recognition perspective, with a goal of providing useful advice and references to fundamental concepts accessible to the broad community of clustering practitioners. We present a taxonomy of clustering techniques, and identify cross-cutting themes and recent advances. We also describe some important applications of clustering algorithms such as image segmentation, object recognition, and information retrieval.",
	number = "3",
	journal = "{ACM} Comput. Surv.",
	author = "A. K. Jain and M. N. Murty and P. J. Flynn",
	year = "1999",
	keywords = "cluster analysis, clustering applications, exploratory data analysis, incremental clustering, similarity indices, unsupervised learning",
	pages = "264--323"
}

@InProceedings{ chang_alignment_1997,
	address = "Madrid, Spain",
	title = "An alignment method for noisy parallel corpora based on image processing techniques",
	url = "http://portal.acm.org/citation.cfm?id=979617.979655\&coll=GUIDE\&dl=GUIDE\&CFID=78470726\&CFTOKEN=79586012",
	abstract = "This paper presents a new approach to bitext correspondence problem {(BCP)} of noisy bilingual corpora based on image processing {(IP)} techniques. By using one of several ways of estimating the lexical translation probability {(LTP)} between pairs of source and target words, we can turn a bitext into a discrete gray-level image. We contend that the {BCP,} when seen in the light, bears a striking resemblance to the line detection problem in {IP.} Therefore, {BCPs,} including sentence and word alignment, can benefit from a wealth of effective, well established {IP} techniques, including convolution-based filters, texture analysis and Hough transform. This paper describes a new program, {PlotAlign} that produces a word-level bitext map for noisy or non-literal bitext, based on these techniques.",
	booktitle = "Proceedings of the eighth conference on European chapter of the Association for Computational Linguistics",
	publisher = "Association for Computational Linguistics",
	author = "Jason S. Chang and Mathis H. Chen",
	year = "1997",
	pages = "297--304",
	annote = "{\textless}p{\textgreater}programme Plot\_align{\textless}/p{\textgreater} {{\textless}p{\textgreater}Chang} propose une m{\'e}thode d'ali bruts bas{\'e}e sur des techniques de traitement de l'image {(IP),} qui sont insensibles aux d{\'e}calages entre deux volets comme les suppressions, ou les traductions non-litt{\'e}rales.{\textless}/p{\textgreater} {{\textless}p{\textgreater}Sa} m{\'e}thode int{\`e}gre des filtres {\`a} convolution, de l'analyse de texture et une transform{\'e}e de Hough.{\textless}/p{\textgreater} {\textless}p{\textgreater} {\textless}/p{\textgreater}"
}

@Article{ dejean_nouvelle_2002,
	title = "Une nouvelle approche {\`a} l’extraction de lexiques bilingues {\`a} partir de corpus comparables",
	volume = "Alignement lexical dans les corpus multilingues",
	abstract = "{RESUME.} Nous proposons dans cet article une nouvelle m{\'e}thode pour l'extraction de lexiques bilingues de corpus comparables. Pour ce faire, nous revenons tout d'abord sur les hypoth{\`e}ses sousjacentes aux travaux dans ce domaine, et d{\'e}taillons ensuite les algorithmes qui en d{\'e}coulent. Enfin, nous {\'e}valuons notre approche sur deux corpus aux caract{\'e}ristiques diff{\'e}rentes, et montrons comment la combinaison de notre m{\'e}thode avec les m{\'e}thodes standard am{\'e}liore de fa\c{c}on significative les r{\'e}sultats. {MOTS-CLES} : extraction de lexique bilingue, corpus comparable, thesaurus multilingue",
	number = "Num{\'e}ro sp{\'e}cial",
	journal = "Lexicometrica",
	author = "Herv{\'e} D{\'e}jean and Eric Gaussier",
	year = "2002"
}

@PhDThesis{ nguyen_extraction_2006,
	address = "Caen, France",
	title = "Extraction d’information {\`a} partir de documents Web multilingues : une approche d’analyses structurelles",
	url = "http://tel.archives-ouvertes.fr/tel-00258948/en/",
	school = "Universit{\'e} de {Caen/Basse-Normandie}",
	author = "Dang Tuan Nguyen",
	year = "2006"
}

@Misc{ langlais_bitexte_2005,
	address = "Antananarivo - Madagascar",
	title = "Le bitexte et ses applications",
	url = "http://www.iro.umontreal.ca/~felipe/Papers/slides-tana-2005.pdf",
	author = "Philippe Langlais",
	year = "2005",
	keywords = "corpus parall{\`e}les, techniques d'alignement"
}

@InProceedings{ poibeau_sur_2005,
	title = "Sur le statut r{\'e}f{\'e}rentiel des entit{\'e}s nomm{\'e}es",
	url = "http://arxiv.org/abs/cs/0510020",
	abstract = "We show in this paper that, on the one hand, named entities can be designated using different denominations and that, on the second hand, names denoting named entities are polysemous. The analysis cannot be limited to reference resolution but should take into account naming strategies, which are mainly based on two linguistic operations: synecdoche and metonymy. Lastly, we present a model that explicitly represents the different denominations in discourse, unifying the way to represent linguistic knowledge and world knowledge.",
	booktitle = "Actes de la Conf{\'e}rence Traitement Automatique des Langues Naturelles",
	author = "Thierry Poibeau",
	year = "2005",
	note = "{POI} 05",
	keywords = "Computer Science - Artificial Intelligence, Computer Science - Information Retrieval"
}

@Article{ langlais_alignement_1997,
	title = "Alignement de corpus bilingues : int{\'e}r{\{\\^e}}ts, algorithmes et {\'e}valuations",
	volume = "num{\'e}ro Hors S{\'e}rie",
	url = "http://www.iro.umontreal.ca/~felipe/Papers/fractal97.ps",
	journal = "Bulletin de Linguistique Appliqu{\'e}e et G{\'e}n{\'e}rale",
	author = "Philippe Langlais",
	month = dec,
	year = "1997",
	pages = "245--254",
	annote = "{\textless}!-- @page { margin: 2cm } P { margin-bottom: 0.21cm } --{\textgreater} {\textless}p style=``margin-left: 0.09cm; margin-bottom: 0cm;''{\textgreater}importance de la combinaison des diff{\'e}rentes sources d'informations (lexique, cognates, longueur de phrases, fr{\'e}quence des appariemments{\textless}/p{\textgreater} {\textless}p style=``margin-left: 0.09cm; margin-bottom: 0cm;''{\textgreater}voir {\'e}galement Melamed, 2000{\textless}/p{\textgreater}"
}

@InProceedings{ fung_aligning_1994,
	title = "Aligning Noisy Parallel Corpora Across Language Groups: Word Pair Feature Matching by Dynamic Time Warping",
	shorttitle = "Aligning Noisy Parallel Corpora Across Language Groups",
	url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.53.4548",
	journal = "{IN} {PROCEEDINGS} {OF} {THE} {FIRST} {CONFERENCE} {OF} {THE} {ASSOCIATION} {FOR} {MACHINE} {TRANSLATION} {IN} {THE} {AMERICAS,} 81--88",
	author = "Pascale Fung and Kathleen Mckeown",
	year = "1994",
	pages = "81----88"
}

@Article{ ozdowska_trois_2007,
	title = "Trois exp{\'e}riences d’{\'e}valuation dans le cadre du d{\'e}veloppement d’un syst{\`e}me d’alignement sous-phrastique",
	volume = "48",
	url = "http://www.citeulike.org/user/Warius/article/3443621",
	abstract = "Nous pr{\'e}sentons la d{\'e}marche que nous avons adopt{\'e}e pour mener {\`a} bien une {\'e}valuation syst{\`e}me dans le contexte du d{\'e}veloppement d’un syst{\`e}me d’alignement sous-phrastique, {ALIBI.} {\`A} cet {\'e}gard, nous examinons trois proc{\'e}dures d’{\'e}valuation qui correspondent {\`a} des aspects fondamentaux de la mise au point de syst{\`e}mes de traitement automatique des langues : une {\'e}valuation par annotation des sorties du syst{\`e}me qui permet d’observer le comportement de chaque composante prise isol{\'e}ment, une {\'e}valuation avec des r{\'e}f{\'e}rences multicorpus qui permet d’observer le comportement d’un syst{\`e}me selon le type de corpus qu’il prend en entr{\'e}e et une {\'e}valuation avec une r{\'e}f{\'e}rence standard disponible publiquement qui permet d’observer son comportement par rapport {\`a} des outils de m{\{\\^e}}me famille. Nous d{\'e}crivons chaque exp{\'e}rience d’{\'e}valuation et faisons le point sur la nature des r{\'e}sultats qu’elle fournit ainsi que leurs apports.",
	number = "1",
	journal = "{TAL}",
	author = "Sylwia Ozdowska",
	year = "2007",
	keywords = "evaluation, precision, rappel",
	annote = "{\textless}!-- @page { margin: 2cm } P { margin-bottom: 0.21cm } --{\textgreater} {\textless}p {align=``JUSTIFY''{\textgreater}{\textless}span} style=``font-style: normal;''{\textgreater}3 types {\'e}valuations : {\textless}/span{\textgreater}{\textless}span style=``font-family: Times New Roman,serif;''{\textgreater}{\textless}span style=``font-size: small;''{\textgreater}{\'e}valuation syst{\`e}me, {\'e}valuation par la t{\{\\^a}}che et {\'e}valuation utilisateur{\textless}/span{\textgreater}{\textless}/span{\textgreater}{\textless}/p{\textgreater} {\textless}p style=``font-style: normal;'' {align=``JUSTIFY''{\textgreater}-} une {\'e}valuation par annotation des sorties du syst{\`e}me qui permet d’observer le comportement de chaque composante prise isol{\'e}ment,{\textless}/p{\textgreater} {\textless}p style=``font-style: normal;'' {align=``JUSTIFY''{\textgreater}-} une {\'e}valuation avec des r{\'e}f{\'e}rences multicorpus qui permet d’observer le comportement d’un syst{\`e}me selon le type de corpus qu’il prend en entr{\'e}e,{\textless}/p{\textgreater} {\textless}p {align=``JUSTIFY''{\textgreater}{\textless}span} style=``font-style: normal;''{\textgreater}- une {\textless}/span{\textgreater}{\textless}span style=``font-size: small;''{\textgreater}{\textless}span style=``font-style: normal;''{\textgreater}{\'e}valuation avec une r{\'e}f{\'e}rence standard disponible publiquement qui permet d’observer son comportement par rapport {\`a} des outils de m{\{\\^e}}me famille. {\textless}/span{\textgreater}{\textless}/span{\textgreater}{\textless}/p{\textgreater}"
}

@InCollection{ veronis_alignement_2000,
	address = "Paris",
	edition = "Editions Herm{\`e}s",
	title = "Alignement de corpus multilingues",
	booktitle = "Ing{\'e}nierie des langues",
	publisher = "{J.-M.} Pierrel",
	author = "Jean V{\'e}ronis",
	year = "2000",
	pages = "151--171",
	annote = "{\textless}p{\textgreater}m{\'e}thodes statistiques :{\textless}/p{\textgreater} {{\textless}p{\textgreater}Dagan,} 1994{\textless}/p{\textgreater} {{\textless}p{\textgreater}Resnik,} 1997{\textless}/p{\textgreater} {{\textless}p{\textgreater}Jones,} 1997{\textless}/p{\textgreater} {{\textless}p{\textgreater}Choueka,} 2000{\textless}/p{\textgreater} {{\textless}p{\textgreater}Fung,} 2000{\textless}/p{\textgreater}"
}

@InProceedings{ tiedemann_opus_2004,
	address = "Lisbon, Portugal",
	series = "Parallel corpora",
	title = "The {OPUS} corpus - parallel and free",
	url = "http://stp.lingfil.uu.se/~joerg/published/lrec04_opus.pdf",
	abstract = "The {OPUS} corpus is a growing collection of translated documents collected from the internet. The current version contains about 30 million words in 60 languages. The entire corpus is sentence aligned and it also contains linguistic markup for certain languages.",
	booktitle = "Proceedings of the Fourth International Conference on Language Resources and Evaluation {(LREC'2004)}",
	author = "J{\"o}rg Tiedemann and Lars Nygard",
	month = may,
	year = "2004"
}

@InProceedings{ brin_anatomy_1998,
	address = "Brisbane, Australia",
	title = "The anatomy of a large-scale hypertextual Web search engine",
	url = "http://portal.acm.org/citation.cfm?id=297805.297827\&coll=GUIDE\&dl=GUIDE\&CFID=35720789\&CFTOKEN=54521891",
	booktitle = "Proceedings of the seventh international conference on World Wide Web 7",
	publisher = "Elsevier Science Publishers B. V.",
	author = "Sergey Brin and Lawrence Page",
	year = "1998",
	note = "{BRI} 98",
	keywords = "google, information retrieval, pagerank, search engines, world wide web",
	pages = "107--117"
}
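
The PageRank computation at the heart of this paper is a power iteration; a minimal sketch follows, using the (1-d)/N normalization so scores sum to one, and an assumed even redistribution of mass from dangling pages.

def pagerank(links, d=0.85, iters=50):
    # links: dict mapping each page to the list of pages it links to
    # (every link target is assumed to appear as a key as well).
    nodes = list(links)
    pr = dict.fromkeys(nodes, 1.0 / len(nodes))
    for _ in range(iters):
        new = dict.fromkeys(nodes, (1 - d) / len(nodes))
        for q, outs in links.items():
            share = d * pr[q] / (len(outs) if outs else len(nodes))
            for p in (outs if outs else nodes):   # dangling pages spread evenly
                new[p] += share
        pr = new
    return pr

# pagerank({"a": ["b"], "b": ["a", "c"], "c": ["a"]})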

@InProceedings{ tiedemann_word_2004,
	address = "Geneva, Switzerland",
	title = "Word to word alignment strategies",
	url = "http://portal.acm.org/citation.cfm?id=1220355.1220386\&coll=GUIDE\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "Word alignment is a challenging task aiming at the identification of translational relations between words and multi-word units in parallel corpora. Many alignment strategies are based on links between single words. Different strategies can be used to find the optimal word alignment using such one-to-one word links including relations between multi-word units. In this paper seven algorithms are compared using a word alignment approach based on association clues and an {English-Swedish} bitext together with a handcrafted reference alignment used for evaluation.",
	booktitle = "Proceedings of the 20th international conference on Computational Linguistics",
	publisher = "Association for Computational Linguistics",
	author = "J{\"o}rg Tiedemann",
	year = "2004",
	pages = "212"
}

@Article{ ferrucci_uima:_2004,
	title = "{UIMA:} an architectural approach to unstructured information processing in the corporate research environment",
	volume = "10",
	shorttitle = "{UIMA}",
	url = "http://portal.acm.org/citation.cfm?id=1030318.1030325\&coll=GUIDE\&dl=GUIDE\&CFID=85648535\&CFTOKEN=64298784",
	abstract = "{IBM} Research has over 200 people working on Unstructured Information Management {(UIM)} technologies with a strong focus on Natural Language Processing {(NLP).} These researchers are engaged in activities ranging from natural language dialog, information retrieval, topic-tracking, named-entity detection, document classification and machine translation to bioinformatics and open-domain question answering. An analysis of these activities strongly suggested that improving the organization's ability to quickly discover each other's results and rapidly combine different technologies and approaches would accelerate scientific advance. Furthermore, the ability to reuse and combine results through a common architecture and a robust software framework would accelerate the transfer of research results in {NLP} into {IBM's} product platforms. Market analyses indicating a growing need to process unstructured information, specifically multilingual, natural language text, coupled with {IBM} Research's investment in {NLP,} led to the development of middleware architecture for processing unstructured information dubbed {UIMA.} At the heart of {UIMA} are powerful search capabilities and a data-driven framework for the development, composition and distributed deployment of {\textless}i{\textgreater}analysis engines{\textless}/i{\textgreater}. In this paper we give a general introduction to {UIMA} focusing on the design points of its analysis engine architecture and we discuss how {UIMA} is helping to accelerate research and technology transfer.",
	number = "3-4",
	journal = "Nat. Lang. Eng.",
	author = "David Ferrucci and Adam Lally",
	year = "2004",
	note = "{FER} 04",
	pages = "327--348"
}

@PhDThesis{ dejean_concepts_1998,
	title = "Concepts et algorithmes pour la d{\'e}couverte des structures formelles des langues",
	url = "http://tel.archives-ouvertes.fr/tel-00169572/en/",
	school = "Universit{\'e} de Caen",
	author = "Herv{\'e} D{\'e}jean",
	month = dec,
	year = "1998",
	note = "Que peut-on apprendre sur la structure d'une langue {\`a} partir d'un texte {\'e}crit dans cette langue, et ceci sans connaissance particuli{\`e}re sur celle-ci et avec l'aide (disons l'utilisation) d'un ordinateur? Voil{\`a} la question {\`a} laquelle nous avons essay{\'e} de r{\'e}pondre. Cette r{\'e}ponse peut {\{\\^e}}tre vue comme une continuation des travaux en analyse distributionnelle d{\'e}velopp{\'e}e par Zellig Harris. L'objectif de ce travail est donc de d{\'e}couvrir les structures formelles d'une langue en {\'e}tudiant ces r{\'e}gularit{\'e}s formelles contenues dans un corpus Notre m{\'e}thode de d{\'e}couverte se base sur une simple conception formelle de la langue: un objet lin{\'e}aire dans lequel les fronti{\`e}res (de d{\'e}but et de fin) des diff{\'e}rentes structures sont indiqu{\'e}es par des {\'e}l{\'e}ments caract{\'e}ristiques. Les structures ainsi identifi{\'e}es sont le syntagme simple (non r{\'e}cursif), et la proposition, structures {\`a} la fois multilingues et formelles. Ces indicateurs de fronti{\`e}res correspondent {\`a} des morph{\`e}mes (libres ou li{\'e}s) pour le syntagme, et {\`a} des morph{\`e}mes ou des syntagmes pour la proposition. {\`A} partir de ces structures th{\'e}oriques, nous construisons la liste de toutes les cat{\'e}gories qu'un {\'e}l{\'e}ment (morph{\`e}me ou mot) peut prendre. Une fois ces structures et cat{\'e}gories recens{\'e}es, nous construisons des contextes sp{\'e}cifiques {\`a} chaque cat{\'e}gorie afin de cat{\'e}goriser les {\'e}l{\'e}ments du texte. Nous obtenons donc un m{\'e}canisme permettant d'assigner {\`a} un {\'e}l{\'e}ment plusieurs cat{\'e}gories si cet {\'e}l{\'e}ment appara{\{\\^i}}t dans diff{\'e}rents contextes. Ces contextes sont construits {\`a} l'aide des {\'e}l{\'e}ments prototypiques de marqueurs de fronti{\`e}res de structures, identifiables gr{\{\\^a}}ce {\`a} leur position par rapport {\`a} la segmentation physique du texte (en particulier les ponctuations). Les r{\'e}sultats obtenus permettent la cat{\'e}gorisation des mots du corpus, ainsi qu'une segmentation partielle en syntagmes. La m{\'e}thode a {\'e}t{\'e} appliqu{\'e}e {\`a} une dizaine de langues comme le fran\c{c}ais, l'allemand, le turc, le vietnamien et le swahili."
}

@PhDThesis{ ozdowska_alibi_2006,
	address = "Toulouse",
	title = "{ALIBI,} un syst{\`e}me {d'ALIgnement} {BIlingue} {\`a} base de r{\`e}gles de propapagation syntaxique",
	url = "http://www.computing.dcu.ie/~sozdowska/publis.html",
	abstract = "L'alignement consiste en la mise en correspondance, dans un corpus parall{\`e}le compos {\'e} d'un texte S et de sa traduction T, de segments textuels qui sont potentiellement la traduction les uns des autres. Cette mise en correspondance peut s'eectuer {\`a} diff {\'e}rents niveaux de segmentation : paragraphes, phrases, syntagmes ou mots. D'abord purement statistiques, les syst{\`e}mes d'alignement au niveau des mots ou des syntagmes se sont progressivement enrichis en incorporant des connaissances linguistiques, notamment syntaxiques. Je m'int{\'e}resse {\`a} l'utilisation de relations de d{\'e}pendance bilingues en tant que g{\'e}n{\'e}rateur de liens d'alignement via le processus de propagation syntaxique. La principe en est le suivant : il s'agit de partir d'un couple de mots qui sont potentiellement traduction l'un de l'autre dans une biphrase et d'aligner les mots qui sont en relation syntaxique avec chaque {\'e}l{\'e}ment de ce couple. C'est ce principe de propagation, exprim{\'e} sous forme de r{\`e}gles d'alignement, qui est mis en ÷uvre dans le syst{\`e}me Alibi. Les r{\`e}gles d'alignement peuvent {\{\\^e}}tre d{\'e}nies manuellement sur la base d'une expertise linguistique et d'observations en corpus mais elles peuvent {\'e}galement {\{\\^e}}tre inf{\'e}r{\'e}es automatiquement par une technique d'apprentissage articiel, ce qui conf{\`e}re une plus-value ind{\'e}niable {\`a} la m{\'e}thode d{\'e}velopp{\'e}e en terme d'adaptabilit{\'e} et donc de portabilit{\'e}. Pour tenir compte de la dimension inter-corpus, ces deux dispositifs sont test{\'e}s sur trois corpus parall{\`e}les anglais/fran\c{c}ais : un corpus de textes scientiques, un corpus de textes institutionnels et un corpus de d{\'e}bats parlementaires. L'{\'e}valuation est eectu{\'e}e par rapport {\`a} des r{\'e}f{\'e}rences multi-juges construites {\`a} cet eet.",
	school = "Universit{\'e} de {Toulouse-Le} Mirail",
	author = "Sylwia Ozdowska",
	month = dec,
	year = "2006",
	keywords = "corpus, couple amorce, {\'e}valuation, techniques d'alignement"
}

@Article{ schmidt_data_2009,
	title = "A data structure for representing multi-version texts online",
	volume = "67",
	url = "http://portal.acm.org/citation.cfm?id=1523966",
	abstract = "The digitisation of cultural heritage and linguistics texts has long been troubled by the problem of how to represent overlapping structures arising from different markup perspectives ('overlapping hierarchies') or from different versions of the same work ('textual variation'). These two problems can be reduced to one by observing that every case of overlapping hierarchies is also a case of textual variation. Overlapping textual structures can be accurately modelled either as a minimally redundant directed graph, or, more practically, as an ordered list of pairs, each containing a set of versions and a fragment of text or data. This 'pairs-list' representation is provably equivalent to the graph representation. It can record texts consisting of thousands of versions or perspectives without becoming overloaded with data, and the most common operations on variant text, e.g. comparison between two versions, can be performed in linear time. This representation also separates variation or other overlapping structures from the document content, leading to a simplification of markup suitable for wiki-like web applications.",
	number = "6",
	journal = "Int. J. {Hum.-Comput.} Stud.",
	author = "Desmond Schmidt and Robert Colomb",
	year = "2009",
	keywords = "cultural heritage, electronic editions, markup, overlapping hierarchies, textual variation",
	pages = "497--514"
}
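
The "ordered list of pairs" representation described in the abstract is simple to mock up: each pair holds a set of version identifiers and a text fragment, and extracting any single version is one linear scan. The names below are illustrative, not the paper's API.

def extract_version(pairs, version):
    # pairs: ordered list of (set_of_versions, fragment);
    # keep the fragments belonging to the requested version, in order.
    return "".join(frag for versions, frag in pairs if version in versions)

doc = [({"A", "B"}, "The cat "),
       ({"A"}, "sat "),
       ({"B"}, "slept "),
       ({"A", "B"}, "on the mat.")]
# extract_version(doc, "A") == "The cat sat on the mat."
# extract_version(doc, "B") == "The cat slept on the mat."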

@InProceedings{ enright_fast_2007,
	address = "Rochester, New York",
	title = "A fast method for parallel document identification",
	url = "http://webdocs.cs.ualberta.ca/~kondrak/papers/hlt07.pdf",
	abstract = "We present a fast method to identify homogeneous parallel documents. The method is based on collecting counts of identical low-frequency words between possibly parallel documents. The candidate with the most shared low-frequency words is selected as the parallel document. The method achieved 99.96\% accuracy when tested on the {EUROPARL} corpus of parliamentary proceedings, failing only in anomalous cases of truncated or otherwise distorted documents. While other work has shown similar performance on this type of dataset, our approach presented here is faster and does not require training. Apart from proposing an efficient method for parallel document identification in a restricted domain, this paper furnishes evidence that parliamentary proceedings may be inappropriate for testing parallel document identification systems in general.",
	booktitle = "Human Language Technologies 2007: The Conference of the North American Chapter of the Association for Computational Linguistics; Companion Volume, Short Papers on {XX}",
	publisher = "Association for Computational Linguistics",
	author = "Jessica Enright and Grzegorz Kondrak",
	year = "2007",
	pages = "29--32"
}

@Article{ melamed_bitext_1999,
	title = "Bitext maps and alignment via pattern recognition",
	volume = "25",
	url = "http://portal.acm.org/citation.cfm?id=973215.973218\&coll=Portal\&dl=GUIDE\&CFID=78818668\&CFTOKEN=17474915",
	abstract = "Texts that are available in two languages (bitexts) are becoming more and more plentiful, both in private data warehouses and on publicly accessible sites on the World Wide Web. As with other kinds of data, the value of bitexts largely depends on the efficacy of the available data mining tools. The first step in extracting useful information from bitexts is to find corresponding words and/or text segment boundaries in their two halves (bitext {maps).This} article advances the state of the art of bitext mapping by formulating the problem in terms of pattern recognition. From this point of view, the success of a bitext mapping algorithm hinges on how well it performs three tasks: signal generation, noise filtering, and search. The Smooth Injective Map Recognizer {(SIMR)} algorithm presented here integrates innovative approaches to each of these tasks. Objective evaluation has shown that {SIMR's} accuracy is consistently high for language pairs as diverse as {French/English} and {Korean/English.} If necessary, {SIMR's} bitext maps can be efficiently converted into segment alignments using the Geometric Segment Alignment {(GSA)} algorithm, which is also presented {here.SIMR} has produced bitext maps for over 200 megabytes of {French-English} bitexts. {GSA} has converted these maps into alignments. Both the maps and the alignments are available from the Linguistic Data Consortium.",
	number = "1",
	journal = "Comput. Linguist.",
	author = "I. Dan Melamed",
	year = "1999",
	pages = "107--130",
	annote = "{\textless}p{\textgreater}importance de la combinaison des diff{\'e}rentes sources d'informations (lexique, cognates, longueur de phrases, fr{\'e}quence des appariemments voir Langlais, 1997{\textless}/p{\textgreater} {\textless}p{\textgreater}+ combiner dans un algorithme avec des m{\'e}thodes de flitrage et de r{\'e}duction de l'espace de recherche{\textless}/p{\textgreater} {\textless}p{\textgreater}voir Langlais, 1997{\textless}/p{\textgreater}"
}

@Misc{ brixtel_expose_2009,
	address = "Caen",
	type = "ppt",
	title = "Expos{\'e} - {\'e}quipe {ISLanD}",
	author = "Romain Brixtel",
	year = "2009",
	keywords = "alignement d'alineas, alignement de chunks, alignement de documents, multi-alin{\'e}as, multidocuments, multidocuments {\'e}clat{\'e}s"
}

@TechReport{ lerman_analyse_1984,
	address = "Rennes, France",
	title = "Analyse d'un algorithme de classification hi{\'e}rarchique 'en parall{\`e}le' pour le traitement de gros ensembles",
	url = "http://hal.archives-ouvertes.fr/inria-00076218/en/",
	author = "{Isra{\"e}l-C{\'e}sar} Lerman and Philippe Peter",
	year = "1984",
	note = "Disponible dans les fichiers attach{\'e}s {\`a} ce document"
}

@Article{ ejerhed_finite_1996,
	title = "Finite state segmentation of discourse into clauses",
	volume = "2",
	url = "http://portal.acm.org/citation.cfm?id=974708",
	abstract = "The paper presents background and motivation for a processing model that segments discourse into units that are simple, non-nested clauses, prior to the recognition of clause internal phrasal constituents, and experimental results in support of this model. One set of results is derived from a statistical reanalysis of the Swedish empirical data in Strangert, Ejerhed and Huber 1993 concerning the linguistic structure of major prosodic units. The other set of results is derived from experiments in segmenting part of speech annotated Swedish text corpora into clauses, using a new clause segmentation algorithm. The clause segmented corpus data is taken from the Stockholm Ume{\aa} Corpus {(SUC),} 1 M words of Swedish texts from different genres, part of speech annotated by hand, and from the Ume{\aa} corpus {DAGENS} {INDUSTRI} 1993 {(DI93),} 5 M words of Swedish financial newspaper text, processed by fully automatic means consisting of tokenizing, lexical analysis, and probabilistic {POS} tagging. The results of these two experiments show that the proposed clause segmentation algorithm is 96\% correct when applied to manually tagged text, and 91\% correct when applied to probabilistically tagged text.",
	number = "4",
	journal = "Nat. Lang. Eng.",
	author = "Eva Ejerhed",
	year = "1996",
	pages = "355--364"
}

@Misc{ _linguist_????,
	title = "Linguist List - Web Resource Listings",
	url = "http://linguistlist.org/sp/GetWRListings.cfm?WRAbbrev=Texts\#173",
	howpublished = "{http://linguistlist.org/sp/GetWRListings.cfm?WRAbbrev=Texts\#173}"
}

@InProceedings{ kraif_architecture_1999,
	address = "facult{\'e} des Lettres Arts et Sciences humaines, Universit{\'e} de Nice Sophia Antipolis",
	title = "Architecture d’un syst{\`e}me d’alignement : {\'e}tude pour une int{\'e}gration optimale des indices d’alignement",
	booktitle = "Actes des Journ{\'e}es internationales de linguistique appliqu{\'e}e",
	author = "Olivier Kraif",
	year = "1999",
	pages = "161--164"
}

@Article{ ozdowska_inference_2006,
	title = "Inf{\'e}rence de r{\`e}gles de propagation syntaxique pour l'alignement de mots",
	volume = "47",
	url = "http://www.computing.dcu.ie/~sozdowska/publis.html",
	abstract = "Cet article pr{\'e}sente et {\'e}value une approche originale d’alignement automatique de bitexte au niveau des mots. Pour cela, elle tire parti d’une analyse syntaxique en d{\'e}pendances et utilise une technique d’apprentissage artificiel, la programmation logique inductive, pour apprendre des r{\`e}gles dites de propagation. Celles-ci s’appuient sur les informations syntaxiques connues pour aligner les mots avec grande pr{\'e}cision. La m{\'e}thode est enti{\`e}rement automatique et ne requiert que peu de donn{\'e}es d’entra{\{\\^i}}nement ; les r{\'e}sultats pr{\'e}sent{\'e}s montrent qu’elle se compare aux meilleures techniques existantes. Enfin, l’examen des r{\`e}gles inf{\'e}r{\'e}es permet d’identifier facilement les cas d’isomorphismes syntaxiques entre les deux langues trait{\'e}es. {MOTS-CL{\'E}S} : alignement de mots, apprentissage artificiel, programmation logique inductive, analyse syntaxique",
	number = "1",
	journal = "Traitement Automatique des Langues",
	author = "Sylwia Ozdowska and Vincent Claveau",
	year = "2006",
	pages = "167--186"
}

@InProceedings{ kraif_identification_1999,
	address = "Carg{\`e}se, France",
	title = "Identification des cognats et alignement bi-textuel : une {\'e}tude empirique",
	booktitle = "Actes de la 6{\`e}me conf{\'e}rence annuelle sur le Traitement Automatique des Langues Naturelles",
	author = "Olivier Kraif",
	year = "1999",
	pages = "205--214"
}

@InProceedings{ zhou_bilingual_2004,
	address = "The Hague, Netherlands",
	title = "Bilingual chunk alignment in statistical machine translation",
	url = "http://cat.inist.fr/?aModele=afficheN\&cpsidt=17523633",
	abstract = "In this paper a new algorithm called {Multi-Layer} Filtering {(MLF)} is proposed for extracting bilingual alignment chunks automatically from a Chinese {-English} parallel corpus. Multiple layers are used to extract bilingual chunks according to different features of chunks in the bilingual corpus. And the alignment chunks are one-to-one corresponding with each other. The chunking and alignment algorithm doesn 't rely on the information from tagging, parsing, syntax analyzing or segmenting for Chinese corpus as most conventional algorithms do. Preliminary experimental results show that the algorithm achieves a good performance in chunking and alignment. Moreover, the translations generated by this algorithm are much better than the results generated by the baseline (word-based statistical machine translation).",
	booktitle = "Proceedings of the 2004 {IEEE} international conference on systems, man \& cybernetics, 10-13 october",
	author = "Yu Zhou and Chengqing Zhong and Bo Xu",
	year = "2004"
}

@InProceedings{ emmanuel_multilingual_2006,
	address = "Sydney, Australia",
	title = "Multilingual lexical database generation from parallel texts in 20 European languages with endogenous resources",
	url = "http://portal.acm.org/citation.cfm?id=1273108",
	abstract = "This paper deals with multilingual database generation from parallel corpora. The idea is to contribute to the enrichment of lexical databases for languages with few linguistic resources. Our approach is endogenous: it relies on the raw texts only, it does not require external linguistic resources such as stemmers or taggers. The system produces alignments for the 20 European languages of the {'Acquis} Communautaire' Corpus.",
	booktitle = "Proceedings of the {COLING/ACL} on Main conference poster sessions",
	publisher = "Association for Computational Linguistics",
	author = "Giguet Emmanuel and Luquet {Pierre-Sylvain}",
	year = "2006",
	keywords = "cosinus (cosine)",
	pages = "271--278"
}

@PhDThesis{ nguyen_outils_2006,
	address = "Nancy, France",
	title = "Outils et ressources linguistiques pour l'alignement de textes multilingues fran\c{c}ais-vietnamiens",
	url = "http://tel.archives-ouvertes.fr/docs/00/10/55/92/PDF/these_NguyenThiMinhHuyen.pdf",
	abstract = "Le travail pr{\'e}sent{\'e} dans ce m{\'e}moire porte sur la construction des outils et ressources linguistiques pour les t{\{\\^a}}ches fondamentales de traitement automatique de la langue vietnamienne, dans un contexte monolingue ainsi que multilingue. Nous pr{\'e}sentons pour cette langue encore peu {\'e}tudi{\'e}e des solutions possibles aux probl{\`e}mes d’annotation morpho-syntaxique (d{\'e}finition de descripteurs lexicaux ``< de r{\'e}f{\'e}rence ''>, construction d’un lexique avec ces descriptions, des outils de segmentation et d’{\'e}tiquetage lexical), d’analyse syntaxique (premi{\`e}re tentative de mod{\'e}lisation de la grammaire vietnamienne en employant le formalisme {TAG,} cadre de construction de ressources pour l’analyse syntaxique) et d’alignement multilingue (constitution d’un corpus multilingue, d{\'e}veloppement d’un syst{\`e}me d’alignement multilingue). Afin d’assurer la r{\'e}utilisabilit{\'e} des travaux r{\'e}alis{\'e}s, et dans l’espoir de les voir stimuler le d{\'e}veloppement du {TAL} au Vietnam, nous avons apport{\'e} une attention particuli{\`e}re aux questions de normalisation de la gestion des ressources linguistiques.",
	school = "Universit{\'e} Henri Poincar{\'e} - Laboratoire Lorrain de Recherche en Informatique et ses Applications {(LORIA°)} - {UMR} 7503",
	author = "Thị Minh Huyền Nguyen",
	year = "2006",
	keywords = "alignement multilingue, analyse syntaxique, annotation linguistique, corpus annot{\'e}s, {\'e}tiquetage lexical / morphosyntaxique, grammaire d’arbres adjoints, lexique, normalisation, partie du discours, ressources linguistiques, segmentation, traitement automatique des langues, vietnamien"
}

@InProceedings{ zimina_alignements_2006,
	address = "Universit{\'e} Paris 12 {Val-de-Marne}",
	title = "Alignements et autres types de parall{\'e}lismes dans les corpus",
	booktitle = "Traitements automatis{\'e}s des discours politiques. Objets nouveaux, nouvelles m{\'e}thodes",
	author = "Maria Zimina and Serge Fleury",
	month = jan,
	year = "2006"
}

@InProceedings{ tiedemann_combining_2003,
	address = "Budapest, Hungary",
	title = "Combining clues for word alignment",
	isbn = "1-333-56789-0",
	url = "http://portal.acm.org/citation.cfm?id=1067807.1067852\&coll=GUIDE\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "In this paper, a word alignment approach is presented which is based on a combination of clues. Word alignment clues indicate associations between words and phrases. They can be based on features such as frequency, part-of-speech, phrase type, and the actual wordform strings. Clues can be found by calculating similarity measures or learned from word aligned data. The clue alignment approach, which is proposed in this paper, makes it possible to combine association clues taking different kinds of linguistic information into account. It allows a dynamic tokenization into token units of varying size. The approach has been applied to an {English/Swedish} parallel text with promising results.",
	booktitle = "Proceedings of the tenth conference on European chapter of the Association for Computational Linguistics - Volume 1",
	publisher = "Association for Computational Linguistics",
	author = "J{\"o}rg Tiedemann",
	year = "2003",
	pages = "339--346"
}

@InProceedings{ simard_bilingual_1996,
	title = "Bilingual Sentence Alignment: Balancing Robustness And Accuracy",
	shorttitle = "Bilingual Sentence Alignment",
	url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.54.6778",
	booktitle = "Proceedings of the Second Conference of the Association for Machine Translation in the Americas {(AMTA)}",
	author = "Michel Simard and Pierre Plamondon",
	year = "1996",
	pages = "59--80",
	annote = "Les m{\'e}thodes de programmation dynamique qui reposent sur des points d'ancrage comme les paragraphes {(Brown,} 1991) ou des informations lexicales comme les cognats {(Simard,} 1992) ne sont pas robustes aux traductions non litt{\'e}rales et aux suppressions."
}

@InProceedings{ morin_extraction_2004,
	address = "F{\`e}s",
	title = "Extraction de terminologies bilingues {\`a} partir de corpus comparables",
	url = "http://aune.lpl.univ-aix.fr/jep-taln04/proceed/actes/taln2004-Fez/Morin-etal.pdf",
	booktitle = "{TALN} 2004, Traitement Automatique de {l’Arabe,} 19-22 avril",
	author = "Emmanuel Morin and Samuel {Dufour-Kowalski} and B{\'e}atrice Daille",
	year = "2004",
	pages = "10"
}

@InCollection{ lebart_ements_1994,
	edition = "{DUNOD}",
	title = "El{\'e}ments caract{\'e}ristiques, r{\'e}ponses ou textes modaux",
	url = "http://www.cavi.univ-paris3.fr/lexicometrica/livre/st94/st94-tdm.html",
	booktitle = "{STATISTIQUE} {TEXTUELLE}",
	author = "Ludovic Lebart and Andr{\'e} Salem",
	year = "1994",
	note = "La statistique textuelle, en plein d{\'e}veloppement, est {\`a} la crois{\'e}e de plusieurs disciplines : la statistique classique, l'analyse du discours, l'informatique, le traitement des enqu{\{\\^e}}tes. En effet, chercheurs et praticiens ont aujourd'hui {\`a} faire face {\`a} un double d{\'e}veloppement, d'une part celui des textes provenant des enqu{\{\\^e}}tes, des entretiens, des archives, des bases documentaires, d'autre part, celui des outils informatiques de saisie et de gestion des textes. La statistique textuelle se veut pr{\'e}cis{\'e}ment un outil destin{\'e} {\`a} parfaire l'analyse, la description, la comparaison, en un mot, le traitement des textes. Ce livre, illustr{\'e} d'exemples nombreux, pr{\'e}sente les concepts de base et les fondements des m{\'e}thodes de la statistique textuelle. Il combine une approche p{\'e}dagogique des outils et un expos{\'e} sur l'{\'e}tat de l'art de cette discipline.",
	keywords = "Statistique textuelle",
	pages = "171--198"
}

@InProceedings{ fung_k-vec:_1994,
	address = "Kyoto, Japan",
	title = "K-vec: a new approach for aligning parallel texts",
	shorttitle = "K-vec",
	url = "http://portal.acm.org/citation.cfm?id=991328",
	abstract = "Various methods have been proposed for aligning texts in two or more languages such as the Canadian Parliamentary Debates {(Hansards).} Some of these methods generate a bilingual lexicon as a by-product. We present an alternative alignment strategy which we call K-vec, that starts by estimating the lexicon. For example, it discovers that the English word fisheries is similar to the French p{\{\\^e}}ches by noting that the distribution of fisheries in the English text is similar to the distribution of p{\{\\^e}}'ches in the French. K-vec does not depend on sentence boundaries.",
	booktitle = "Proceedings of the 15th conference on Computational linguistics - Volume 2",
	publisher = "Association for Computational Linguistics",
	author = "Pascale Fung and Kenneth Ward Church",
	year = "1994",
	keywords = "K-vec Algorithm",
	pages = "1096--1102",
	annote = "{\textless}p{\textgreater}alignement partiel de mots comme point d'ancrage {\`a} un alignement de phrases{\textless}/p{\textgreater} {\textless}p{\textgreater}ali bruts avec programmatio dynamique{\textless}/p{\textgreater} {\textless}p{\textgreater}attention pas de cognats entre certains groupes de langues (contr. Simard 1992){\textless}/p{\textgreater} {\textless}p{\textgreater}k-vec =\&gt; k sortes de partitions du corpus{\textless}/p{\textgreater} {\textless}p{\textgreater} {\textless}/p{\textgreater} {\textless}p{\textgreater}cf kay, 1988{\textless}/p{\textgreater} {{\textless}p{\textgreater}Debili,} 1992{\textless}/p{\textgreater} {{\textless}p{\textgreater}Dagan,} 1993{\textless}/p{\textgreater} {\textless}p{\textgreater} {\textless}/p{\textgreater}"
}

@InProceedings{ gale_identifying_1991,
	address = "Pacific Grove, California",
	title = "Identifying word correspondence in parallel texts",
	url = "http://portal.acm.org/citation.cfm?id=112405.112428\&coll=Portal\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "Researchers in both machine translation (e.g., Brown {\textless}i{\textgreater}et al{\textless}/i{\textgreater}, 1990) and bilingual lexicography (e.g., Klavans and Tzoukermann, 1990) have recently become interested in studying {\textless}i{\textgreater}parallel texts{\textless}/i{\textgreater} (also known as {\textless}i{\textgreater}bilingual corpora{\textless}/i{\textgreater}), bodies of text such as the Canadian Hansards (parliamentary debates) which are available in multiple languages (such as French and English). Much of the current excitement surrounding parallel texts was initiated by Brown {\textless}i{\textgreater}et al.{\textless}/i{\textgreater} (1990), who outline a self-organizing method for using these parallel texts to build a machine translation system.",
	booktitle = "Proceedings of the workshop on Speech and Natural Language",
	publisher = "Association for Computational Linguistics",
	author = "William A. Gale and Kenneth W. Church",
	year = "1991",
	pages = "152--157"
}

@InProceedings{ triantafyllou_alignment_2000,
	address = "International conference at the University of Exeter, {UK.}",
	title = "An alignment architecture for Translation Memory Bootstrapping",
	url = "http://www.mt-archive.info/authors-T.htm",
	booktitle = "Machine Translation Archive",
	author = "Ioannis Triantafyllou and Iason Demiros and Christos Malavazos and Stelios Piperidis",
	year = "2000"
}

@Article{ durieux_foisonnement_1990,
	title = "Le foisonnement en traduction technique d’anglais en fran\c{c}ais",
	volume = "35",
	issn = "0026-0452",
	url = "http://id.erudit.org/iderudit/002689ar",
	number = "1",
	journal = "Meta",
	author = "Christine {DURIEUX}",
	year = "1990",
	pages = "55--60"
}

@PhDThesis{ kraif_constitution_2001,
	title = "Constitution et exploitation de bi-textes pour l’aide {\`a} la traduction",
	school = "Universit{\'e} de Nice Sophia- Antipolis",
	author = "Olivier Kraif",
	year = "2001"
}

@Article{ cortes_algorithmes_2009,
	title = "Des algorithmes d'apprentissage pour mieux classifier",
	number = "386",
	journal = "Pour la science",
	author = "Corinna Cortes and Patrick Haffner and Mehryar Mohri",
	year = "2009",
	pages = "38--44"
}

@Article{ harris_bi-text_1988,
	title = "Bi-text, a new concept in translation theory",
	volume = "54",
	url = "http://en.wikipedia.org/wiki/Parallel_text",
	journal = "Language Monthly {(UK)}",
	author = "Brian Harris",
	month = mar,
	year = "1988"
}

@InProceedings{ melamed_word-to-word_1997,
	address = "Madrid, Spain",
	title = "A word-to-word model of translational equivalence",
	url = "http://portal.acm.org/citation.cfm?id=979617.979680\&coll=GUIDE\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "Many multilingual {NLP} applications need to translate words between different languages, but cannot afford the computational expense of inducing or applying a full translation model. For these applications, we have designed a fast algorithm for estimating a partial translation model, which accounts for translational equivalence only at the word level. The model's precision/recall trade-off can be directly controlled via one threshold parameter. This feature makes the model more suitable for applications that are not fully statistical. The model's hidden parameters can be easily conditioned on information extrinsic to the model, providing an easy way to integrate pre-existing knowledge such as part-of-speech, dictionaries, word order, etc. Our model can link word tokens in parallel texts as well as other translation models in the literature. Unlike other translation models, it can automatically produce dictionary-sized translation lexicons, and it can do so with over 99\% accuracy.",
	booktitle = "Proceedings of the eighth conference on European chapter of the Association for Computational Linguistics",
	publisher = "Association for Computational Linguistics",
	author = "I. Dan Melamed",
	year = "1997",
	pages = "490--497"
}

@Article{ yamamoto_using_2001,
	title = "Using suffix arrays to compute term frequency and document frequency for all substrings in a corpus",
	volume = "27",
	url = "http://portal.acm.org/citation.cfm?id=972779",
	abstract = "Bigrams and trigrams are commonly used in statistical natural language processing; this paper will describe techniques for working with much longer n-grams. Suffix arrays {(Manber} and Myers 1990) were first introduced to compute the frequency and location of a substring (n-gram) in a sequence (corpus) of length N. To compute frequencies over all {N(N} + 1)/2 substrings in a corpus, the substrings are grouped into a manageable number of equivalence classes. In this way, a prohibitive computation over substrings is reduced to a manageable computation over classes. This paper presents both the algorithms and the code that were used to compute term frequency (tf) and document frequency (dr)for all n-grams in two large corpora, an English corpus of 50 million words of Wall Street Journal and a Japanese corpus of 216 million characters of Mainichi {Shimbun.The} second half of the paper uses these frequencies to find ``interesting'' substrings. Lexicographers have been interested in n-grams with high mutual information {(MI)} where the joint term frequency is higher than what would be expected by chance, assuming that the parts of the n-gram combine independently. Residual inverse document frequency {(RIDF)} compares document frequency to another model of chance where terms with a particular term frequency are distributed randomly throughout the collection. {MI} tends to pick out phrases with noncompositional semantics (which often violate the independence assumption) whereas {RIDF} tends to highlight technical terminology, names, and good keywords for information retrieval (which tend to exhibit nonrandom distributions over documents). The combination of both {MI} and {RIDF} is better than either by itself in a Japanese word extraction task.",
	number = "1",
	journal = "Comput. Linguist.",
	author = "Mikio Yamamoto and Kenneth W. Church",
	year = "2001",
	pages = "1--30"
}

@InProceedings{ kraif_quattendre_2006,
	address = "Universit{\'e} Catholique de Lyon",
	title = "Qu'attendre de l'alignement de corpus multilingues ?",
	author = "Olivier Kraif",
	year = "2006"
}

@Article{ brown_statistical_1990,
	title = "A statistical approach to machine translation",
	volume = "16",
	url = "http://portal.acm.org/citation.cfm?id=92858.92860\&coll=Portal\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "In this paper, we present a statistical approach to machine translation. We describe the application of our approach to translation from French to English and give preliminary results.",
	number = "2",
	journal = "Comput. Linguist.",
	author = "Peter F. Brown and John Cocke and Stephen A. Della Pietra and Vincent J. Della Pietra and Fredrick Jelinek and John D. Lafferty and Robert L. Mercer and Paul S. Roossin",
	year = "1990",
	pages = "79--85"
}

@InProceedings{ zimina_topographie_2006,
	address = "institut sup{\'e}rieur de traducteurs et interpr{\`e}tes {(ISTI),} Bruxelles {(Belgique)}",
	title = "Topographie bi-textuelle et approches quantitatives de l'extraction de ressources traductionnelles {\`a} partir de corpus parall{\`e}les",
	abstract = "La notion de corpus parall{\`e}le comportant plusieurs volets qui correspondent chacun {\`a} une version d’un m{\{\\^e}}me texte dans deux ou plusieurs langues diff{\'e}rentes, renvoie {\`a} des situations connues de coexistence de textes pr{\'e}sentant des liens forts dans leur structuration. Les m{\'e}thodes quantitatives permettent d’acc{\'e}der {\`a} de nouvelles dimensions d’analyse de ces corpus pluritextuels. Les perspectives ouvertes par cette approche offrent aux traducteurs, terminologues, lexicographes, etc., des moyens automatis{\'e}s pour explorer la structure des correspondances lexicales par la navigation textom{\'e}trique en corpus. Mots-cl{\'e}s : bitexte, corpus parall{\`e}les, statistique textuelle, textom{\'e}trie, topographie textuelle.",
	booktitle = "Actes des 7es Journ{\'e}es scientifiques du R{\'e}seau de chercheurs {``Lexicologie,} Terminologie, Traduction",
	author = "Maria Zimina",
	year = "2006",
	keywords = "bitexte, corpus parall{\`e}les, statistique textuelle, textom{\'e}trie, topographie textuelle",
	pages = "175--186"
}

@InProceedings{ gaillard_combining_2009,
	address = "Nantes, France",
	title = "Combining {NER} Systems via a {UIMA-based} platform",
	url = "http://2009.rmll.info/Coll-UIMA-Combining-NER-Systems.html",
	booktitle = "1st French-speaking meeting around the framework Apache {UIMA}",
	author = "Baptiste Gaillard and Sylvie {Guillemin-Lanne} and Guillaume Jacquet and Claude Martineau and Aur{\'e}lie Migeotte",
	month = jul,
	year = "2009",
	note = "{GAI} 09",
	keywords = "ie, ner, ontology, uima"
}

@Misc{ vespe_displaying_????,
	title = "{DISPLAYING} {FACTS} {ON} A {LINEAR} {GRAPH}",
	author = "D. J Vespe and A. W Hogue and A. Kehlenbeck and M. Gordon and J. C Reynar and D. B Alpert",
	note = "{VES} 07"
}

@Misc{ _machine_????,
	title = "Machine Translation Archive: authors O",
	url = "http://www.mt-archive.info/authors-O.htm",
	howpublished = "{http://www.mt-archive.info/authors-O.htm}"
}

@InProceedings{ kraif_extraction_2006,
	address = "Lyon",
	title = "Extraction automatique de lexique bilingue : application pour la recherche d'exemples en lexicographie",
	abstract = "Avec le d{\'e}veloppement du Web, les corpus multilingues parall{\`e}les en domaine sp{\'e}cialis{\'e} sont de plus en plus accessibles : un grand nombre de textes sont disponibles en ligne, qu'il s'agisse de documentations techniques (projets Open Source, corpus {OPUS),} de documents juridiques ou institutionnels {(ONU,} Acquis communautaire, Hansard, etc.), de rapports {\'e}manant d'organisations internationales {(OIT,} {OMS,} etc.), d'articles encyclop{\'e}diques {(Wikis)} ou de textes litt{\'e}raires diffus{\'e}s par des projets de num{\'e}risation {\`a} grande {\'e}chelle {(Projet} Gutenberg, {ABU,} etc.). Par ailleurs, des outils destin{\'e}s aux traducteurs ou aux linguistes {(Trados} Winalign, Giza++, Alinea, etc.) permettent de r{\'e}aliser l'alignement de ces corpus au niveau des phrases. Comme nous l'avons montr{\'e} {(Kraif} \& Chen, 2004), il est possible d'extraire automatiquement de ces corpus align{\'e}s des correspondances lexicales relativement fiables pour les unit{\'e}s portant un sens plein. Ce type d'extraction n{\'e}cessite la constitution d'un corpus parall{\`e}le suffisamment grand, de l'ordre d'un million de mots par langue, mais aucune autre ressource linguistique n'est requise. En outre, les m{\'e}thodes employ{\'e}es, qui se basent sur des indices superficiels (positions dans les phrases, ressemblances formelles, distributions dans les corpus) sont applicables {\`a} des couples de langues quelconques, et produisent des r{\'e}sultats int{\'e}ressants m{\{\\^e}}me sur des langues g{\'e}n{\'e}tiquement et typologiquement {\'e}loign{\'e}es, comme le fran\c{c}ais et l'arabe ou le chinois {(Chiao} et al., {\`a} para{\{\\^i}}tre). Apr{\`e}s avoir bri{\`e}vement d{\'e}crit les m{\'e}thodes permettant une telle extraction, nous chercherons {\`a} d{\'e}gager des pistes dans la perspective d'une utilisation lexicographique de ces correspondances. Nous montrerons, {\`a} partir des fonctionnalit{\'e}s d{\'e}velopp{\'e}es dans le logiciel Alinea, comment le lexique bilingue automatiquement extrait, bien qu'il constitue une donn{\'e}e brute partiellement bruit{\'e}e, constitue n{\'e}anmoins une ressource int{\'e}ressante pour la recherche d'exemples {\`a} travers un corpus bilingue. Nous verrons {\'e}galement comment des outils de concordance bilingue permettent d'aborder la recherche de constructions polylexicales, gr{\{\\^a}}ce {\`a} un langage de requ{\{\\^e}}te sophistiqu{\'e}, susceptible d'int{\'e}grer des crit{\`e}res au niveau des formes, des lemmes et des traits morphosyntaxiques, en utilisant le formalisme des expressions r{\'e}guli{\`e}res.",
	author = "Olivier Kraif",
	year = "2006"
}

@InProceedings{ church_char_align:_1993,
	address = "Stroudsburg, {PA,} {USA}",
	series = "{ACL} '93",
	title = "Char\_align: a program for aligning parallel texts at the character level",
	location = "Columbus, Ohio",
	shorttitle = "Char\_align",
	url = "http://dx.doi.org/10.3115/981574.981575",
	doi = "http://dx.doi.org/10.3115/981574.981575",
	abstract = "There have been a number of recent papers on aligning parallel texts at the sentence level, e.g., Brown et al (1991), Gale and Church (to appear), Isabelle (1992), Kay and R{\"o}senschein (to appear), Simard et al (1992), {Warwick-Armstrong} and Russell (1990). On clean inputs, such as the Canadian Hansards, these methods have been very successful (at least 96\% correct by sentence). Unfortunately, if the input is noisy (due to {OCR} and/or unknown markup conventions), then these methods tend to break down because the noise can make it difficult to find paragraph boundaries, let alone sentences. This paper describes a new program, char\_align, that aligns texts at the character level rather than at the sentence/paragraph level, based on the cognate approach proposed by Simard et al.",
	booktitle = "Proceedings of the 31st annual meeting on Association for Computational Linguistics",
	publisher = "Association for Computational Linguistics",
	author = "Kenneth Ward Church",
	year = "1993",
	note = "{ACM} {ID:} 981575",
	pages = "1--8",
	annote = "{\textless}p{\textgreater}difficult{\'e} d'identifier les limites de phrases dans un doc bruit{\'e} avec un dispositif {OCR{\textless}/p{\textgreater}} {{\textless}p{\textgreater}Id{\'e}e} que beaucoup de cognats entre les langues indo europ{\'e}ennes (char\_align diff de K-vec){\textless}/p{\textgreater} {\textless}p{\textgreater}char\_align = m{\'e}thode design{\'e}e pour les langues indo europ{\'e}enes{\textless}/p{\textgreater} {\textless}p{\textgreater}objectif : r{\'e}aliser un ali brut avec programmation dynamique.{\textless}/p{\textgreater}"
}

@InCollection{ voyatzi_traitement_2010,
	title = "Traitement de la modalit{\'e} {``Texte''}",
	booktitle = "S{\'e}mantique et multimodalit{\'e} en analyse de l'information",
	publisher = "Herm{\`e}s, {\`a} para{\{\\^i}}tre",
	author = "Stavroula Voyatzi and Caroline Brun and Nicolas Dessaigne and Maud Ehrmann and Sylvie {Guillemin-Lanne} and Guillaume Jacquet and Aur{\'e}lie Migeotte",
	year = "2010",
	note = "{VOY} 10"
}

@Article{ mcnamee_character_2004,
	title = "Character {N-Gram} Tokenization for European Language Text Retrieval",
	volume = "7",
	issn = "1386-4564",
	url = "http://portal.acm.org/citation.cfm?id=961294.961313",
	doi = "10.1023/B:INRT.0000009441.78971.be",
	abstract = "The {Cross-Language} Evaluation Forum has encouraged research in text retrieval methods for numerous European languages and has developed durable test suites that allow language-specific techniques to be investigated and compared. The labor associated with crafting a retrieval system that takes advantage of sophisticated linguistic methods is daunting. We examine whether language-neutral methods can achieve accuracy comparable to language-specific methods with less concomitant software complexity. Using the {CLEF} 2002 test set we demonstrate empirically how overlapping character n-gram tokenization can provide retrieval accuracy that rivals the best current language-specific approaches for European languages. We show that n \&equals; 4 is a good choice for those languages, and document the increased storage and time requirements of the technique. We report on the benefits of and challenges posed by n-grams, and explain peculiarities attendant to bilingual retrieval. Our findings demonstrate clearly that accuracy using n-gram indexing rivals or exceeds accuracy using unnormalized words, for both monolingual and bilingual retrieval.",
	journal = "Information Retrieval",
	author = "Paul Mcnamee and James Mayfield",
	month = jan,
	year = "2004",
	note = "{ACM} {ID:} 961313",
	keywords = "algorithms, character n-grams, cross language evaluation forum",
	pages = "73--97"
}

@InProceedings{ kadri_traduction_2004,
	address = "F{\`e}s",
	title = "Traduction des requ{\{\\^e}}tes pour la recherche d’information translinguistique anglais-arabe",
	url = "http://www.afcp-parole.org/doc/Archives_JEP/2004_XXVe_JEP_Fes/actes/arabe.htm",
	abstract = "Nous traitons dans cet article le probl{\`e}me de la traduction des requ{\{\\^e}}tes pour la Recherche {d’Information} Translinguistique {(RIT).} Le probl{\`e}me de la {RIT} consiste {\`a} trouver des documents en arabe avec des requ{\{\\^e}}tes en anglais. La traduction des requ{\{\\^e}}tes est une t{\{\\^a}}che essentielle. Notre approche de traduction de requ{\{\\^e}}tes pour la {RIT} est bas{\'e}e sur l’entra{\{\\^i}}nement d’un mod{\`e}le de traduction statistique sur un corpus de textes parall{\`e}les extraits du Web. D’autres m{\'e}thodes de traduction bas{\'e}es sur les textes parall{\`e}les et les dictionnaires bilingues sont aussi propos{\'e}es. Une attention particuli{\`e}re sera mise sur le traitement morphologique de l'arabe pour la lemmatisation. Nos exp{\'e}rimentations montrent que, si on dispose de ressources multiples pour la traduction de requ{\{\\^e}}te, leur combinaison am{\'e}liore grandement la performance de la Recherche {d'Information} {(RI).}",
	booktitle = "{JEP-TALN} 2004, Traitement Automatique de {l’Arabe,} 19-22 avril",
	author = "Youssef Kadri and {Jian-Yun} Nie",
	year = "2004"
}

@InProceedings{ bourdaillet_alignment_2007,
	address = "Hyderabad, India",
	title = "Alignment of noisy unstructured data",
	url = "http://research.ihost.com/and2007/cd/Proceedings_files/p139.pdf",
	abstract = "This paper describes a textual aligner named {MEDITE} whose specificity is the detection of moves. It was developed to solve a problem from textual genetic criticism, a humanities discipline that compares different versions of authors’ texts in order to highlight invariants and differences between them. Our aligner handles this task and it is general enough to handle others. The algorithm, based on the edit distance with moves, aligns duplicated character blocks with an A∗ heuristic algorithm. We present an experimental evaluation of our algorithm by comparing it with similar ones in four experiments. The first one deals with the alignment of texts with a large amount of repetitions; we show it is a very difficult problem. Two other experiments are duplicate linkage and text reuse detection. Finally, the algorithm is tested with synthetic data.",
	booktitle = "{IJCAI-2007} Workshop on Analytics for Noisy Unstructured Text Data, January 6-12",
	author = "Julien Bourdaillet and {Jean-Gabriel} Ganascia",
	year = "2007",
	keywords = "alignment, sequences, similarity"
}

@Article{ deleger_translating_2009,
	title = "Translating medical terminologies through word alignment in parallel text corpora",
	volume = "42",
	url = "http://portal.acm.org/citation.cfm?id=1563056.1563260\&coll=GUIDE\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "Developing international multilingual terminologies is a time-consuming process. We present a methodology which aims to ease this process by automatically acquiring new translations of medical terms based on word alignment in parallel text corpora, and test it on English and French. After collecting a parallel, {English-French} corpus, we detected French translations of English terms from three {terminologies-MeSH,} {SNOMED} {CT} and the {MedlinePlus} Health Topics. We obtained respectively for each terminology 74.8\%, 77.8\% and 76.3\% of linguistically correct new translations. A sample of the {MeSH} translations was submitted to expert review and 61.5\% were deemed desirable additions to the French {MeSH.} In conclusion, we successfully obtained good quality new translations, which underlines the suitability of using alignment in text corpora to help translating terminologies. Our method may be applied to different European languages and provides a methodological framework that may be used with different processing tools.",
	number = "4",
	journal = "J. of Biomedical Informatics",
	author = "Louise Del{\'e}ger and Magnus Merkel and Pierre Zweigenbaum",
	year = "2009",
	keywords = "medical terminology, multilinguality, natural language processing, parallel corpora, word alignment",
	pages = "692--701"
}

@InCollection{ rousselot_terminologie_2002,
	edition = "Presses Universitaires de Caen",
	title = "Terminologie et Intelligence Artificielle",
	url = "http://u2.u-strasbg.fr/spiral/Equipe/Pfrath.html",
	abstract = "Les similitudes entre la construction de terminologies et la construction d’ontologies ont favoris{\'e} une r{\'e}flexion commune entre deux communaut{\'e}s de chercheurs pas toujours tr{\`e}s proches. Cette collaboration a {\'e}t{\'e} promue et d{\'e}velopp{\'e}e par un groupe national, Terminologie et Intelligence Artificielle {(TIA),} qui travaille depuis maintenant sept ans sur ce sujet. Nous faisons ici {\'e}tat des d{\'e}veloppements th{\'e}oriques et applicatifs issus de cette collaboration. Nous commen\c{c}ons par examiner les rapports entre la terminologie et la linguistique. Puis nous {\'e}voquons une branche florissante en ing{\'e}nierie des connaissances, celle qui a pour objet de construire des ontologies; nous examinons ensuite les proc{\'e}d{\'e}s d'extraction de terminologie {\`a} partir de corpus ainsi que les travaux qui visent {\`a} construire des bases de connaissances terminologiques.",
	booktitle = "Traits d'union",
	publisher = "G. Kleiber et N. Le Querler, dir.",
	author = "Fran\c{c}ois Rousselot and Pierre Frath",
	year = "2002",
	pages = "181--192"
}

@Article{ mustafa_character_2005,
	title = "Character contiguity in N-gram-based word matching: the case for Arabic text searching",
	volume = "41",
	issn = "0306-4573",
	shorttitle = "Character contiguity in N-gram-based word matching",
	url = "http://www.sciencedirect.com/science/article/B6VC8-4C0V3VT-3/2/6df3baa9938aced31f67c062a13107ea",
	doi = "10.1016/j.ipm.2004.02.003",
	abstract = "This work assesses the performance of two N-gram matching techniques for Arabic root-driven string searching: contiguous N-grams and hybrid N-grams, combining contiguous and non-contiguous. The two techniques were tested using three experiments involving different levels of textual word stemming, a textual corpus containing about 25 thousand words (with a total size of about {160KB),} and a set of 100 query textual words. The results of the hybrid approach showed significant performance improvement over the conventional contiguous approach, especially in the cases where stemming was used. The present results and the inconsistent findings of previous studies raise some questions regarding the efficiency of pure conventional N-gram matching and the ways in which it should be used in languages other than English.",
	number = "4",
	journal = "Information Processing \& Management",
	author = "Suleiman H. Mustafa",
	month = jul,
	year = "2005",
	keywords = "N-grams, Stemming, String matching, Text searching, Word conflation",
	pages = "819--827"
}

@InProceedings{ semmar_arabic_2007,
	address = "Prague, Czech Republic",
	title = "Arabic to French sentence alignment: exploration of a cross-language information retrieval approach",
	shorttitle = "Arabic to French sentence alignment",
	url = "http://portal.acm.org/citation.cfm?id=1654589\&dl=GUIDE\&coll=GUIDE\&CFID=73291750\&CFTOKEN=13220873",
	abstract = "Sentence alignment consists in estimating which sentence or sentences in the source language correspond with which sentence or sentences in a target language. We present in this paper a new approach to aligning sentences from a parallel corpus based on a cross-language information retrieval system. This approach consists in building a database of sentences of the target text and considering each sentence of the source text as a ``query'' to that database. The cross-language information retrieval system is a weighted Boolean search engine based on a deep linguistic analysis of the query and the documents to be indexed. This system is composed of a multilingual linguistic analyzer, a statistical analyzer, a reformulator, a comparator and a search engine. The multilingual linguistic analyzer includes a morphological analyzer, a part-of-speech tagger and a syntactic analyzer. The linguistic analyzer processes both documents to be indexed and queries to produce a set of normalized lemmas, a set of named entities and a set of nominal compounds with their morpho-syntactic tags. The statistical analyzer computes for documents to be indexed concept weights based on concept database frequencies. The comparator computes intersections between queries and documents and provides a relevance weight for each intersection. Before this comparison, the reformulator expands queries during the search. The expansion is used to infer from the original query words other words expressing the same concepts. The search engine retrieves the ranked, relevant documents from the indexes according to the corresponding reformulated query and then merges the results obtained for each language, taking into account the original words of the query and their weights in order to score the documents. The sentence aligner has been evaluated on the {MD} corpus of the {ARCADE} {II} project which is composed of news articles from the French newspaper {``Le} Monde Diplomatique''. The part of the corpus used in evaluation consists of the same subset of sentences in Arabic and French. Arabic sentences are aligned to their French counterparts. Results showed that alignment has correct precision and recall even when the corpus is not completely parallel (changes in sentence order or missing sentences).",
	booktitle = "Proceedings of the 2007 Workshop on Computational Approaches to Semitic Languages: Common Issues and Resources",
	publisher = "Association for Computational Linguistics",
	author = "Nasredine Semmar and Christian Fluhr",
	year = "2007",
	pages = "73--80"
}

@InProceedings{ cardey_modepour_2009,
	address = "Universit{\'e} de Technologie de Troyes, 27 \& 28 Janvier 2009",
	title = "Mod{\`e}le pour une Traduction Automatique fid{\`e}le : Le syst{\`e}me {TACTmultilingue,} Projet {LiSe} {(Linguistique} et S{\'e}curit{\'e})",
	booktitle = "actes du Workshop Interdisciplinaire sur la S{\'e}curit{\'e} Globale",
	author = "Sylviane {CARDEY} and Raksi {ANANTALAPOCHAI} and Mohand {BEDDAR} and Dilber {DEVITRE} and Peter {GREENFIELD} and Gan {JIN} and Laurent {SPAGGIARI} and Dominique {VUITTON}",
	year = "2009"
}

@InProceedings{ nakamura-delloye_methodes_2007,
	address = "Toulouse, France",
	title = "M{\'e}thodes d’alignement des propositions : un d{\'e}fi aux traductions crois{\'e}es",
	abstract = "Le pr{\'e}sent article d{\'e}crit deux m{\'e}thodes d’alignement des propositions : l’une bas{\'e}e sur les m{\'e}thodes d’appariement des graphes et une autre inspir{\'e}e de la classification ascendante hi{\'e}rarchique {(CAH).} Les deux m{\'e}thodes sont caract{\'e}ris{\'e}es par leur capacit{\'e} d’alignement des traductions crois{\'e}es, ce qui {\'e}tait impossible pour beaucoup de m{\'e}thodes classiques d’alignement des phrases. Contrairement aux r{\'e}sultats obtenus avec l’approche spectrale qui nous paraissent non satisfaisants, l’alignement bas{\'e} sur la m{\'e}thode de classification ascendante hi{\'e}rarchique est prometteur dans la mesure o{\`u} cette technique supporte bien les traductions crois{\'e}es.",
	booktitle = "Actes de la 14{\`e}me conf{\'e}rence annuelle sur le Traitement Automatique des Langues Naturelles, 12-15 juin",
	author = "Yayoi {Nakamura-Delloye}",
	year = "2007",
	keywords = "alignement des corpus parall{\`e}les, appariement de graphes, classification ascendante hi{\'e}rarchique, linguistique contrastive, m{\'e}moire de traduction, proposition syntaxique, traductions crois{\'e}es"
}

@InProceedings{ giguet_multilingual_2005,
	address = "Chiang Rai, Thaïland",
	title = "Multilingual Lexical Database Generation from parallel texts with endogenous resources",
	abstract = "This paper deals with multilingual database generation from parallel corpora. The idea is to contribute to the enrichment of lexical databases for languages with few linguistic resources. Our approach is endogenous: it relies on the raw texts only, it does not require external linguistic resources such as stemmers or taggers. The system produces alignments for the 20 European languages of the {‘Acquis} Communautaire’ Corpus.",
	booktitle = "{PAPILLON-2005} Workshop on Multilingual Lexical Databases",
	author = "Emmanuel Giguet and {Pierre-Sylvain} Luquet",
	year = "2005",
	keywords = "cosinus (cosine)"
}

@InProceedings{ cromieres_sub-sentential_2006,
	address = "Sydney, Australia",
	title = "Sub-sentential alignment using substring co-occurrence counts",
	url = "http://portal.acm.org/citation.cfm?id=1557860",
	abstract = "In this paper, we will present an efficient method to compute the co-occurrence counts of any pair of substring in a parallel corpus, and an algorithm that make use of these counts to create sub-sentential alignments on such a corpus. This algorithm has the advantage of being as general as possible regarding the segmentation of text.",
	booktitle = "Proceedings of the 21st International Conference on computational Linguistics and 44th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop",
	publisher = "Association for Computational Linguistics",
	author = "Fabien Cromieres",
	year = "2006",
	pages = "13--18",
	annote = "{{\textless}p{\textgreater}Ali} pr{\'e}alable en phrases{\textless}br {/{\textgreater}Mais} pas d'utilisation d'outils de segmentation ou de ressources ling{\textless}br /{\textgreater}{\textless}br {/{\textgreater}On} utilise les meilleures corr{\'e}lations d'abord, puis on interdit celles d'apr{\`e}s de les utiliser.{\textless}br {/{\textgreater}Il} esiste un grain sans discontinuit{\'e} entre les {\'e}l{\'e}ments (ne .... pas) {\textless}br /{\textgreater}{\textless}br {/{\textgreater}N-grammes} de caract{\`e}res =\&gt; sch{\'e}ma 0-1 = Alignement partiel autoris{\'e}{\textless}br /{\textgreater}{\textless}br /{\textgreater}lien coeff de corr{\'e}lation et volume des donn{\'e}es{\textless}br /{\textgreater}p{\'e}nalit{\'e} sur les alignements des {\'e}l{\'e}ments les plus courts{\textless}/p{\textgreater}"
}

@InCollection{ lepage_measure_2009,
	title = "A Measure of the Number of True Analogies between Chunks in Japanese",
	isbn = "978-3-642-04234-8",
	url = "http://portal.acm.org/citation.cfm?id=1616962",
	abstract = "This study relates to the assessment of the argument of the poverty of the stimulus in that we conducted a measure of the number of true proportional analogies between chunks in a language with case markers, Japanese. On a bicorpus of 20,000 sentences, we show that at least 96\% of the analogies of form between chunks are also analogies of meaning, thus reporting the presence of at least two million true analogies between chunks in this corpus. As the number of analogies between chunks overwhelmingly surpasses the number of analogies between sentences by three orders of magnitude for this size of corpora, we conclude that proportional analogy is an efficient and undeniable structuring device between Japanese chunks.",
	booktitle = "Human Language Technology. Challenges of the Information Society: Third Language and Technology Conference, {LTC} 2007, Poznan, Poland, October 5-7, 2007, Revised Selected Papers",
	publisher = "{Springer-Verlag}",
	author = "Yves Lepage and Julien Migeot and Erwan Guillerm",
	year = "2009",
	keywords = "chunks, japanese language, structure of language, true analogies",
	pages = "154--164"
}

@InCollection{ qin_supporting_2003,
	title = "Supporting Multilingual Information Retrieval in Web Applications: An {English-Chinese} Web Portal Experiment",
	shorttitle = "Supporting Multilingual Information Retrieval in Web Applications",
	url = "http://www.springerlink.com/content/xj7qaxeqdtmk2j8d",
	abstract = "Cross-language information retrieval {(CLIR)} and multilingual information retrieval {(MLIR)} techniques have been widely studied, but they are not often applied to and evaluated for Web applications. In this paper, we present our research in developing and evaluating a multilingual {English-Chinese} Web portal in the business domain. A dictionary-based approach has been adopted that combines phrasal translation, co-occurrence analysis, and pre- and post-translation query expansion. The approach was evaluated by domain experts and the results showed that co-occurrence-based phrasal translation achieved a 74.6\% improvement in precision when compared with simple word-by-word translation.",
	booktitle = "Digital Libraries: Technology and Management of Indigenous Knowledge for Global Access",
	author = "Jialun Qin and Yilu Zhou and Michael Chau and Hsinchun Chen",
	year = "2003",
	pages = "149--152"
}

@Article{ kay_text-translation_1993,
	title = "Text-translation alignment",
	volume = "19",
	url = "http://portal.acm.org/citation.cfm?id=972450.972457\&coll=GUIDE\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "We present an algorithm for aligning texts with their translations that is based only on internal evidence. The relaxation process rests on a notion of which word in one text corresponds to which word in the other text that is essentially based on the similarity of their distributions. It exploits a partial alignment of the word level to induce a maximum likelihood alignment of the sentence level, which is in turn used, in the next iteration, to refine the word level estimate. The algorithm appears to converge to the correct sentence alignment in only a few iterations.",
	number = "1",
	journal = "Comput. Linguist.",
	author = "Martin Kay and Martin R{\"o}scheisen",
	year = "1993",
	keywords = "distribution, internal evidence, similarity",
	pages = "121--142"
}

@Article{ lepage_purest_2005,
	title = "Purest ever example-based machine translation: Detailed presentation and assessment",
	volume = "19",
	shorttitle = "Purest ever example-based machine translation",
	url = "http://portal.acm.org/citation.cfm?id=1227596",
	abstract = "We have designed, implemented and assessed an {EBMT} system that can be dubbed the ``purest ever built'': it strictly does not make any use of variables, templates or patterns, does not have any explicit transfer component, and does not require any preprocessing or training of the aligned examples. It uses only a specific operation, proportional analogy, that implicitly neutralizes divergences between languages and captures lexical and syntactic variations along the paradigmatic and syntagmatic axes without explicitly decomposing sentences into fragments. Exactly the same genuine implementation of such a core engine was evaluated on different tasks and language pairs. To begin with, we compared our system on two tasks of a previous {MT} evaluation campaign to rank it among other current state-of-the-art systems. Then, we illustrated the ``universality'' of our system by participating in a recent {MT} evaluation campaign, with exactly the same core engine, for a wide variety of language pairs. Finally, we studied the influence of extra data like dictionaries and paraphrases on the system performance.",
	number = "3-4",
	journal = "Machine Translation",
	author = "Yves Lepage and Etienne Denoual",
	year = "2005",
	keywords = "divergences across languages, example-based machine translation, proportional analogies",
	pages = "251--282"
}

@TechReport{ muller_acquisition_1997,
	address = "{INIST-CNRS,} Cote {INIST} : 14802 E",
	title = "Acquisition et structuration des connaissances en corpus : {\'e}l{\'e}ments m{\'e}thodologiques",
	shorttitle = "Acquisition et structuration des connaissances en corpus",
	url = "http://hal.archives-ouvertes.fr/inria-00073491/en/",
	abstract = "Ce document pr{\'e}sente une exp{\'e}rimentation men{\'e}e dans le domaine de l'agriculture. Les travaux ont {\'e}t{\'e} men{\'e}s dans le cadre du projet {ILC} sur l'analyse de l'information. L'objectif de cette exp{\'e}rimentation est de montrer comment l'exploitation de modules automatiques de traitement de la langue bas{\'e}s sur la terminologie peuvent {\{\\^e}}tre combin{\'e}s avec des modules de classication pour faire {\'e}merger de corpus volumineux de textes, des classes de termes. Ces classes sont interpr{\'e}tables et instancient des mod{\`e}les abstraits de connaissance du domaine de sp{\'e}cialit{\'e} que nous avons retrouv{\'e}s manuellement. Nous avons trait{\'e} un corpus de 1386 r{\'e}sum{\'e}s de notices bibliographiques en anglais. La cha{\{\\^i}}ne linguistique op{\`e}re {\'e}galement sur le fran\c{c}ais.",
	number = "{INRIA-RR} - 3198",
	institution = "{INRIA} Lorraine - Equipe : {DIALOGUE}",
	author = "Chantal Muller and Xavier Polanco and Jean Royaut{\'e} and Yannick Toussaint",
	month = jun,
	year = "1997",
	note = "Ce document pr{\'e}sente une exp{\'e}rimentation r{\'e}alis{\'e}e dans le domaine de l'agriculture. Les travaux ont {\'e}t{\'e} men{\'e}s dans le cadre du projet {ILC} sur l'analyse de l'information. L'objectif de cette exp{\'e}rimentation est de montrer comment l'exploitation de modules automatiques de traitement de la langue bas{\'e}s sur la terminologie peuvent {\{\\^e}}tre combin{\'e}s avec des modules de classification pour faire {\'e}merger de corpus volumineux de textes, des classes de termes. Ces classes sont interpr{\'e}tables et instancient des mod{\`e}les abstraits de connaissance du domaine de sp{\'e}cialit{\'e} que nous avons retrouv{\'e}s manuellement. Nous avons trait{\'e} un corpus de 1386 r{\'e}sum{\'e}s de notices bibliographiques en anglais. La cha{\{\\^i}}ne linguistique op{\`e}re {\'e}galement sur le fran\c{c}ais.",
	pages = "48 pages"
}

@InProceedings{ gerdes_lalignement_2008,
	address = "Lyon, France",
	title = "L’alignement pour les pauvres : Adapter la bonne m{\'e}trique pour un algorithme dynamique de dilatation temporelle pour l’alignement sans ressources de corpus bilingues",
	abstract = "Les corpus bilingues align{\'e}s sont essentiels dans l’{\'e}laboration de ressources bilingues comme dans tout travail traductologique, mais l’alignement n{\'e}cessite en lui-m{\{\\^e}}me des ressources bilingues ou d’importantes interventions de locuteurs bilingues. Cet article d{\'e}crit un travail en cours sur l’alignement de textes bilingues {\`a} l’aide d’un algorithme dynamique de dilatation temporelle. De tels algorithmes sont les seuls {\`a} fonctionner sans aucune ressource bilingue et sans se baser sur des similarit{\'e}s entre langue source et cible (cognats lexicaux ou de ponctuation) : seul le signal de chaque mot dans le corpus {\`a} aligner est compar{\'e} aux signaux des mots de la langue cible. Nous montrons que le choix de la bonne m{\'e}trique utilis{\'e}e dans cette comparaison est primordial pour l’utilit{\'e} des r{\'e}sultats. Il est possible d’am{\'e}liorer les r{\'e}sultats de deux mani{\`e}res : en {\'e}largissant les segments pour inclure des mots compos{\'e}s et d’autres collocations lin{\'e}aires, et en incluant des mots similaires (cognats intra-langues d{\'e}gag{\'e}s {\`a} la Levenshtein) dans les couples de mots bilingues pour capter les flexions des mots. Les points d’ancrage ainsi d{\'e}gag{\'e}s, en fonction de la similarit{\'e} des signaux, servent {\`a} aligner les textes des deux langues. L’approche donne des r{\'e}sultats satisfaisants m{\{\\^e}}me pour des couples de langues comme le fran\c{c}ais (flexion riche) et le chinois (langue isolante, {\'e}criture sans espacement). Pour une accessibilit{\'e} maximale, l’impl{\'e}mentation, {\'e}crite en Python, C et Javascript, tourne, en version pr{\'e}liminaire, sur un serveur web.",
	booktitle = "9es Journ{\'e}es internationales {d’Analyse} statistique des Donn{\'e}es Textuelles",
	author = "Kim Gerdes",
	year = "2008"
}

@InProceedings{ deville_generation_2004,
	address = "{Louvain-la-Neuve,} Belgique",
	title = "G{\'e}n{\'e}ration de corpus multilingues dans la mise en oeuvre d'un outil en ligne d'aide {\`a} la lecture de textes en langue {\'e}trang{\`e}re",
	url = "http://www.cavi.univ-paris3.fr/lexicometrica/jadt/textometrie-multilingue/page5.html",
	abstract = "This paper presents a method for the automatic generation of aligned bilingual corpora in a Web-based reading tool for Dutch texts by French speaking learners {(NEDERLEX).} The authors first discuss the major functions of {NEDERLEX.} Then they describe the role of bilingual corpora in the design and construction of the {NEDERLEX} tool, as well as the approach adopted for the extraction and alignment of such corpora. A demo of the {NEDERLEX} prototype will be presented during the conference talk.",
	booktitle = "In Actes des {JADT'04}",
	author = "Guy Deville and Laurence Dumortier and Hans Paulussen",
	year = "2004",
	keywords = "alignement de corpus multilingues, logiciel en ligne d’apprentissage des langues {\'e}trang{\`e}res",
	pages = "304--312"
}

@InProceedings{ moore_improved_2006,
	address = "Sydney, Australia",
	title = "Improved discriminative bilingual word alignment",
	url = "http://portal.acm.org/citation.cfm?id=1220240",
	abstract = "For many years, statistical machine translation relied on generative models to provide bilingual word alignments. In 2005, several independent efforts showed that discriminative models could be used to enhance or replace the standard generative approach. Building on this work, we demonstrate substantial improvement in word-alignment accuracy, partly though improved training methods, but predominantly through selection of more and better features. Our best model produces the lowest alignment error rate yet reported on Canadian Hansards bilingual data.",
	booktitle = "Proceedings of the 21st International Conference on Computational Linguistics and the 44th annual meeting of the Association for Computational Linguistics",
	publisher = "Association for Computational Linguistics",
	author = "Robert C. Moore and Wen-tau Yih and Andreas Bode",
	year = "2006",
	pages = "513--520"
}

@Article{ veronis_arcade_1999,
	title = "{ARCADE} : Evaluation de syst{\`e}mes d'alignement de textes multilingues",
	volume = "4",
	number = "1",
	journal = "Lettre de {l'ELRA}",
	author = "Jean V{\'e}ronis and Philippe Langlais",
	year = "1999"
}

@InProceedings{ alegria_named_2006,
	address = "Trento, Italie",
	title = "Named Entities Translation Based on Comparable Corpora",
	url = "http://ixa.si.ehu.es/Ixa/Argitalpenak/Artikuluak/1139581750/publikoak/NET-based-on-Comparable-Corpora",
	abstract = "In this paper we present a system for translating named entities from Basque to Spanish based on comparable corpora. For that purpose we have tried two approaches: one based on Basque linguistic features, and a language-independent tool. For both tools we have used Basque- Spanish comparable corpora, a bilingual dictionary and the web as resources.",
	booktitle = "Proceedings of the Workshop on {Multi-Word-Expressions} in a Multilingual Context at {EACL06}",
	author = "I{\~n}aki Alegria and Nerea Ezeiza and Izaskun Fernandez",
	month = apr,
	year = "2006",
	keywords = "wikipedia"
}

@InProceedings{ giguet_multi-grained_2005,
	address = "Borovets, Bulgaria",
	title = "Multi-grained alignment of parallel texts with endogenous resources",
	abstract = "This paper deals with the spotting of multi-grained translation equivalents from parallel corpora. The idea is to contribute to the processing of languages for which few linguistic resources are available. We especially pay attention to the handling of highly inflectional languages. Our approach is endogenous: it does not require external linguistic resources such as stemmers or taggers.",
	booktitle = "In Proceedings of the Recent Advances in Natural Language Processing {(RANLP)} International Workshop {``New} Trends in Machine Translations''.",
	author = "Emmanuel Giguet",
	year = "2005",
	pages = "12--17"
}

@InProceedings{ groves_hybrid_2005,
	address = "Ann Arbor, Michigan",
	title = "Hybrid example-based {SMT:} the best of both worlds?",
	shorttitle = "Hybrid example-based {SMT}",
	url = "http://portal.acm.org/citation.cfm?id=1654449.1654490\&coll=\&dl=ACM\&type=series\&idx=SERIES12862\&part=series\&WantType=Proceedings\&title=ACL%20Workshops",
	abstract = "{(Way} and Gough, 2005) provide an in-depth comparison of their {Example-Based} Machine Translation {(EBMT)} system with a Statistical Machine Translation {(SMT)} system constructed from freely available tools. According to a wide variety of automatic evaluation metrics, they demonstrated that their {EBMT} system outperformed the {SMT} system by a factor of two to one.",
	booktitle = "Proceedings of the {ACL} Workshop on Building and Using Parallel Texts",
	publisher = "Association for Computational Linguistics",
	author = "Declan Groves and Andy Way",
	year = "2005",
	pages = "183--190"
}

@Article{ damashek_gauging_1995,
	title = "Gauging Similarity with {n-Grams:} {Language-Independent} Categorization of Text",
	volume = "267",
	shorttitle = "Gauging Similarity with {n-Grams}",
	url = "http://www.sciencemag.org/cgi/content/abstract/267/5199/843",
	doi = "10.1126/science.267.5199.843",
	abstract = "A language-independent means of gauging topical similarity in unrestricted text is described. The method combines information derived from n-grams (consecutive sequences of n characters) with a simple vector-space technique that makes sorting, categorization, and retrieval feasible in a large multilingual collection of documents. No prior information about document content or language is required. Context, as it applies to document similarity, can be accommodated by a well-defined procedure. When an existing document is used as an exemplar, the completeness and accuracy with which topically related documents are retrieved is comparable to that of the best existing systems. The results of a formal evaluation are discussed, and examples are given using documents in English and Japanese.",
	journal = "Science",
	author = "Marc Damashek",
	month = feb,
	year = "1995",
	pages = "843--848"
}
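
Annotation: Damashek's method reduces each document to a normalized histogram of character n-grams and compares two documents by the cosine of the angle between their histograms. A minimal sketch in Python, assuming plain strings as input and n = 3; the function names are illustrative and not taken from the paper, which additionally subtracts a centroid vector to model context.

    from collections import Counter
    from math import sqrt

    def ngram_profile(text, n=3):
        """Relative-frequency histogram of character n-grams."""
        grams = Counter(text[i:i + n] for i in range(len(text) - n + 1))
        total = sum(grams.values())
        return {g: c / total for g, c in grams.items()} if total else {}

    def cosine_similarity(p, q):
        """Cosine similarity between two sparse n-gram profiles."""
        dot = sum(w * q.get(g, 0.0) for g, w in p.items())
        norm = sqrt(sum(w * w for w in p.values())) * sqrt(sum(w * w for w in q.values()))
        return dot / norm if norm else 0.0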

@Article{ gale_program_1993,
	title = "A program for aligning sentences in bilingual corpora",
	volume = "19",
	url = "http://portal.acm.org/citation.cfm?id=972450.972455\&coll=GUIDE\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "Researchers in both machine translation (e.g., Brown et al. 1990) and bilingual lexicography (e.g., Klavans and Tzoukermann 1990) have recently become interested in studying bilingual corpora, bodies of text such as the Canadian Hansards (parliamentary proceedings), which are available in multiple languages (such as French and English). One useful step is to align the sentences, that is, to identify correspondences between sentences in one language and sentences in the other {language.This} paper will describe a method and a program (align) for aligning sentences based on a simple statistical model of character lengths. The program uses the fact that longer sentences in one language tend to be translated into longer sentences in the other language, and that shorter sentences tend to be translated into shorter sentences. A probabilistic score is assigned to each proposed correspondence of sentences, based on the scaled difference of lengths of the two sentences (in characters) and the variance of this difference. This probabilistic score is used in a dynamic programming framework to find the maximum likelihood alignment of {sentences.It} is remarkable that such a simple approach works as well as it does. An evaluation was performed based on a trilingual corpus of economic reports issued by the Union Bank of Switzerland {(UBS)} in English, French, and German. The method correctly aligned all but 4\% of the sentences. Moreover, it is possible to extract a large subcorpus that has a much smaller error rate. By selecting the best-scoring 80\% of the alignments, the error rate is reduced from 4\% to 0.7\%. There were more errors on the {English-French} subcorpus than on the {English-German} subcorpus, showing that error rates will depend on the corpus considered; however, both were small enough to hope that the method will be useful for many language {pairs.To} further research on bilingual corpora, a much larger sample of Canadian Hansards (approximately 90 million words, half in English and and half in French) has been aligned with the align program and will be available through the Data Collection Initiative of the Association for Computational Linguistics {(ACL/DCI).} In addition, in order to facilitate replication of the align program, an appendix is provided with detailed c-code of the more difficult core of the align program.",
	number = "1",
	journal = "Comput. Linguist.",
	author = "William A. Gale and Kenneth W. Church",
	year = "1993",
	pages = "75--102"
}
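
Annotation: the align program scores a candidate pairing of segments by how far the two character lengths deviate from the expected length ratio, then chains those scores with dynamic programming. A toy rendering of the length-based score in Python; the constants c = 1 and s2 = 6.8 are the estimates reported in the paper, and the helper name is mine, not the program's.

    from math import erf, log, sqrt

    C = 1.0    # expected characters in language 2 per character in language 1
    S2 = 6.8   # variance of that per-character ratio (paper's estimate)

    def length_cost(l1, l2):
        """-log probability that segments of l1 and l2 characters are
        mutual translations, judged on length alone."""
        if l1 == 0:
            return 0.0
        delta = (l2 - l1 * C) / sqrt(l1 * S2)
        # Two-tailed tail probability of a standard normal deviate.
        p = 2.0 * (1.0 - 0.5 * (1.0 + erf(abs(delta) / sqrt(2.0))))
        return -log(p) if p > 0.0 else float("inf")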

@InProceedings{ crego_plusieurs_2009,
	address = "Senlis, France",
	title = "Plusieurs langues (bien choisies) valent mieux qu’une : traduction statistique multi-source par renforcement lexical",
	abstract = "Les syst{\`e}mes de traduction statistiques int{\`e}grent diff{\'e}rents types de mod{\`e}les dont les pr{\'e}dictions sont combin{\'e}es, lors du d{\'e}codage, afin de produire les meilleures traductions possibles. Traduire correctement des mots polys{\'e}miques, comme, par exemple, le mot avocat du fran\c{c}ais vers l’anglais (lawyer ou avocado), requiert l’utilisation de mod{\`e}les suppl{\'e}mentaires, dont l’estimation et l’int{\'e}gration s’av{\`e}rent complexes. Une alternative consiste {\`a} tirer parti de l’observation selon laquelle les ambiguït{\'e}s li{\'e}es {\`a} la polys{\'e}mie ne sont pas les m{\{\\^e}}mes selon les langues source consid{\'e}r{\'e}es. Si l’on dispose, par exemple, d’une traduction vers l’espagnol dans laquelle avocat a {\'e}t{\'e} traduit par aguacate, alors la traduction de ce mot vers l’anglais n’est plus ambigu{\"e}. Ainsi, la connaissance d’une traduction fran\c{c}ais!espagnol permet de renforcer la s{\'e}lection de la traduction avocado pour le syst{\`e}me fran\c{c}ais!anglais. Dans cet article, nous proposons d’utiliser des documents en plusieurs langues pour renforcer les choix lexicaux effectu{\'e}s par un syst{\`e}me de traduction automatique. En particulier, nous montrons une am{\'e}lioration des performances sur plusieurs m{\'e}triques lorsque les traductions auxiliaires utilis{\'e}es sont obtenues manuellement.",
	booktitle = "Actes de la 16{\`e}me conf{\'e}rence annuelle sur le Traitement Automatique des Langues Naturelles, 24-26 juin",
	author = "Adrien Crego and Aur{\'e}lien Max and Fran\c{c}ois Yvon",
	year = "2009",
	keywords = "d{\'e}sambiguïsation lexicale, r{\'e}{\'e}valuation de listes d’hypoth{\`e}ses, Traduction automatique statistique"
}

@InProceedings{ moore_fast_2002,
	title = "Fast and Accurate Sentence Alignment of Bilingual Corpora",
	isbn = "3-540-44282-0",
	url = "http://portal.acm.org/citation.cfm?id=749407",
	booktitle = "Proceedings of the 5th Conference of the Association for Machine Translation in the Americas on Machine Translation: From Research to Real Users",
	publisher = "{Springer-Verlag}",
	author = "Robert C. Moore",
	year = "2002",
	pages = "135--144"
}

@InProceedings{ majumder_n-gram_2002,
	address = "Goa, India",
	title = "N-gram : a language independent approach to {IR} and {NLP}",
	shorttitle = "n-gram",
	url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.126.8275",
	booktitle = "Proceedings of the International Conference on Universal Knowledge and Language, 25-29 novembre",
	author = "Prasenjit Majumder",
	year = "2002"
}

@InProceedings{ wu_aligning_1994,
	address = "Las Cruces, New Mexico",
	title = "Aligning a parallel {English-Chinese} corpus statistically with lexical criteria",
	url = "http://portal.acm.org/citation.cfm?id=981732.981744\&coll=GUIDE\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "We describe our experience with automatic alignment of sentences in parallel {English-Chinese} texts. Our report concerns three related topics: (1) progress on the {HKUST} {English-Chinese} Parallel Bilingual Corpus; (2) experiments addressing the applicability of Gale \& Church's (1991) length-based statistical method to the task of alignment involving a {non-Indo-European} language; and (3) an improved statistical method that also incorporates domain-specific lexical cues.",
	booktitle = "Proceedings of the 32nd annual meeting on Association for Computational Linguistics",
	publisher = "Association for Computational Linguistics",
	author = "Dekai Wu",
	year = "1994",
	pages = "80--87"
}

@Misc{ _index_????,
	title = "Index par mots-cl{\'e}s",
	url = "http://www.atala.org/Index-par-mots-cles",
	howpublished = "{http://www.atala.org/Index-par-mots-cles}"
}

@Misc{ veronis_etude_2006,
	title = "Etude comparative de six moteurs de recherche",
	url = "http://sites.univ-provence.fr/veronis/pdf/2006-etude-comparative.pdf",
	author = "Jean V{\'e}ronis",
	year = "2006",
	note = "{VER} 06"
}

@InProceedings{ church_stochastic_1988,
	address = "Austin, Texas",
	title = "A stochastic parts program and noun phrase parser for unrestricted text",
	url = "http://portal.acm.org/citation.cfm?id=974235.974260\&coll=Portal\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "Note: {OCR} errors may be found in this Reference List extracted from the full text article. {ACM} has opted to expose the complete List rather than only correct and linked references.",
	booktitle = "Proceedings of the second conference on Applied natural language processing",
	publisher = "Association for Computational Linguistics",
	author = "Kenneth Ward Church",
	year = "1988",
	pages = "136--143"
}

@Article{ patterson_why_2004,
	title = "Why Writing Your Own Search Engine Is Hard",
	volume = "2",
	url = "http://portal.acm.org/ft_gateway.cfm?id=988407\&type=html\&coll=GUIDE\&dl=GUIDE\&CFID=93741085\&CFTOKEN=74471388",
	doi = "10.1145/988392.988407",
	abstract = "There must be 4,000 programmers typing away in their basements trying to build the next ``world's most scalable'' search engine. It has been done only a few times. It has never been done by a big group; always one to four people did the core work, and the big team came on to build the elaborations and the production infrastructure. Why is it so hard? We are going to delve a bit into the various issues to consider when writing a search engine. This article is aimed at those individuals or small groups that are considering this endeavor for their Web site or intranet. It is fun, but a word of caution: not only is it difficult, but you need two commodities in short supplytime and patience.",
	number = "2",
	journal = "Queue",
	author = "Anna Patterson",
	year = "2004",
	note = "{PAT} 04",
	pages = "48--53"
}

@InProceedings{ giguet_alignement_2005,
	address = "Lorient, France",
	title = "Alignement d'unit{\'e}s textuelles de taille variable",
	url = "http://hal.archives-ouvertes.fr/index.php?halsid=50le6pgjvcg7ral86p9i2qt010\&view_this_doc=halshs-00202140\&version=1",
	author = "Emmanuel Giguet and Marianna Apidianaki",
	year = "2005",
	note = "L'approche propos{\'e}e dans cet article est une approche endog{\`e}ne d'alignement sous-phrastique, qui concerne le rep{\'e}rage d'{\'e}quivalents de traduction de taille variable, et notamment sup{\'e}rieure {\`a} un mot."
}

@Book{ anderson_long_2008,
	edition = "Rev Upd",
	title = "The Long Tail: Why the Future of Business Is Selling Less of More",
	isbn = "1401309666",
	shorttitle = "The Long Tail",
	publisher = "Hyperion Books",
	author = "Chris Anderson",
	month = jul,
	year = "2008",
	note = "{AND} 08"
}

@Article{ levenshtein_binary_1966,
	title = "Binary codes capable of correcting deletions insertions and reversals",
	volume = "10",
	journal = "Soviet Physics Doklady",
	author = "Vladimir Levenshtein",
	year = "1966",
	note = "{LEV} 66",
	pages = "707--710"
}
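
Annotation: several entries in this file (Gerdes 2008; Simard, Foster and Isabelle 1993) use this edit distance to detect cognates. The 1966 paper only defines the metric; the implementation below is the standard dynamic-programming rendering, in Python, with two rolling rows.

    def levenshtein(a, b):
        """Minimum number of insertions, deletions and substitutions
        turning string a into string b."""
        prev = list(range(len(b) + 1))
        for i, ca in enumerate(a, 1):
            curr = [i]
            for j, cb in enumerate(b, 1):
                curr.append(min(prev[j] + 1,                 # deletion
                                curr[j - 1] + 1,             # insertion
                                prev[j - 1] + (ca != cb)))   # substitution
            prev = curr
        return prev[-1]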

@Article{ lerman_comparaison_2008,
	title = "Comparaison de deux crit{\`e}res en classification ascendante hi{\'e}rarchique sous contrainte de contiguït{\'e}. application en imagerie num{\'e}rique",
	volume = "tome 149",
	abstract = "Nous analysons une algorithmique de classification ascendante hi{\'e}rarchique sous contrainte de contiguït{\'e} par agr{\'e}gation des voisins r{\'e}ciproques en la situant dans le contexte g{\'e}n{\'e}ral des algorithmes rapides de classification ascendante hi{\'e}rarchique. Surtout, nous la d{\'e}clinons selon deux types de crit{\`e}res. Il s'agit d'une part, du crit{\`e}re de Ward de la variation de l'inertie expliqu{\'e}e et d'autre part, d'une famille param{\'e}tr{\'e}e du crit{\`e}re {VL} de la vraisemblance du lien maximal. Le contexte applicatif est celui de la segmentation d'image. On souligne la nature lin{\'e}aire de la complexit{\'e} algorithmique que nous montrons exp{\'e}rimentalement. L'influence algorithmique de la notion de contiguït{\'e} retenue est mise en {\'e}vidence. Une nouvelle strat{\'e}gie mettant en oeuvre l'agr{\'e}gation multiple dans la formation des classes montre tout son int{\'e}r{\{\\^e}}t. On {\'e}tudie aussi bien sur le plan th{\'e}orique qu'exp{\'e}rimental la possibilit{\'e} d'inversions compte tenu du type de crit{\`e}re utilis{\'e}. Nous terminons en proposant une analyse comparative des r{\'e}sultats sur des donn{\'e}es r{\'e}elles en imagerie satellitaire.",
	number = "n°2",
	journal = "Journal de la Soci{\'e}t{\'e} Fran\c{c}aise de Statistique \& Revue de statistique appliqu{\'e}e",
	author = "{Isra{\"e}l-C{\'e}sar} Lerman and Kaddour Bachar",
	year = "2008",
	pages = "pp. 45--74"
}

@InProceedings{ maurel_les_2004,
	title = "Les mots inconnus sont-ils des noms propres ?",
	booktitle = "Proceedings of {JADT}",
	author = "Denis Maurel",
	year = "2004",
	note = "{MAU} 04"
}

@Misc{ _atala:_????,
	title = "{ATALA:} Lexique Bilingue",
	url = "http://atala.biomath.jussieu.fr/glossaire.html",
	howpublished = "http://atala.biomath.jussieu.fr/glossaire.html"
}

@InCollection{ patry_automatic_2005,
	title = "Automatic Identification of Parallel Documents With Light or Without Linguistic Resources",
	url = "http://www-etud.iro.umontreal.ca/~patryale/papers/patry_langlais_2005_ai.pdf",
	abstract = "Parallel corpora are playing a crucial role in multilingual natural language processing. Unfortunately, the availability of such a resource is the bottleneck in most applications of interest. Mining the web for parallel corpora is a viable solution that comes at a price: it is not always easy to identify parallel documents among the crawled material. In this study we address the problem of automatically identifying the pairs of texts that are translation of each other in a set of documents. We show that it is possible to automatically build particularly efficient content-based methods that make use of very little lexical knowledge. We also evaluate our approach toward a front-end translation task and demonstrate that our parallel text classifier yields better performances than another approach based on a rich lexicon.",
	booktitle = "Advances in Artificial Intelligence",
	author = "Alexandre Patry and Philippe Langlais",
	year = "2005",
	keywords = "corpus parall{\`e}le, identification",
	pages = "354--365"
}

@InProceedings{ lardilleux_lalignement_2009,
	address = "Avignon, France",
	title = "L’alignement sous-phrastique multilingue pour les nuls",
	booktitle = "7e Manifestation des Jeunes Chercheurs en Sciences et Technologies de {l'Information} et de la Communication, 16-18 novembre",
	author = "Adrien Lardilleux",
	year = "2009"
}

@InProceedings{ perrot_outillage_2009,
	address = "{LIP6} - Universit{\'e} Pierre et Marie Curie \& {ERTIM} - {INaLCO,} 7 octobre 2009",
	title = "Outillage informatique pour la pratique du plurilinguisme",
	author = "{Jean-Fran\c{c}ois} Perrot",
	year = "2009"
}

@InCollection{ veronis_evaluation_2000,
	address = "Dordrecht",
	edition = "Kluwer Academic Publishers",
	title = "Evaluation of parallel text alignment systems: the {ARCADE} project.",
	booktitle = "Parallel text processing: Alignment and use of translation corpora",
	publisher = "J. V{\'e}ronis",
	author = "Jean V{\'e}ronis and Philippe Langlais",
	year = "2000",
	pages = "369--388"
}

@Article{ broder_taxonomy_2002,
	title = "A taxonomy of web search",
	volume = "36",
	url = "http://portal.acm.org/citation.cfm?id=792552",
	doi = "10.1145/792550.792552",
	abstract = "Classic {IR} (information retrieval) is inherently predicated on users searching for information, the so-called ``information need''. But the need behind a web search is often not informational -- it might be navigational (give me the url of the site I want to reach) or transactional (show me sites where I can perform a certain transaction, e.g. shop, download a file, or find a map). We explore this taxonomy of web searches and discuss how global search engines evolved to deal with web-specific needs.",
	number = "2",
	journal = "{SIGIR} Forum",
	author = "Andrei Broder",
	year = "2002",
	note = "{BRO} 02",
	pages = "3--10"
}

@TechReport{ melamed_manual_1998,
	address = "Philadelphia",
	type = "Technical Report",
	title = "Manual Annotation of Translational Equivalence: The Blinker Project",
	shorttitle = "Manual Annotation of Translational Equivalence",
	url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.53.3659",
	institution = "Institute for Research in Cognitive Science",
	author = "Dan Melamed",
	year = "1998",
	pages = "13 pages"
}

@PhDThesis{ lardilleux_contribution_2010,
	title = "Contribution des basses fr{\'e}quences {\`a} l'alignement sous-phrastique multilingue : une approche diff{\'e}rentielle",
	shorttitle = "Contribution des basses fr{\'e}quences {\`a} l'alignement sous-phrastique multilingue",
	url = "http://hal.archives-ouvertes.fr/index.php?halsid=rsgsimesspm32r8ug106nbpr03\&view_this_doc=tel-00520787\&version=1",
	abstract = "L'objectif de cette th{\`e}se est de montrer que, contrairement aux id{\'e}es re\c{c}ues, les mots de basses fr{\'e}quences peuvent {\{\\^e}}tre mis {\`a} profit de fa\c{c}on efficace en traitement automatique des langues. Nous les mettons {\`a} contribution en alignement sous-phrastique, t{\{\\^a}}che qui constitue la premi{\`e}re {\'e}tape de la plupart des syst{\`e}mes de traduction automatique fond{\'e}e sur les donn{\'e}es (traduction probabiliste ou par l'exemple). Nous montrons que les mots rares peuvent servir de fondement m{\{\\^e}}me dans la conception d'une m{\'e}thode d'alignement sous-phrastique multilingue, {\`a} l'aide de techniques diff{\'e}rentielles proches de celles utilis{\'e}es en traduction automatique par l'exemple. Cette m{\'e}thode est r{\'e}ellement multilingue, en ce sens qu'elle permet le traitement simultan{\'e} d'un nombre quelconque de langues. Elle est de surcro{\{\\^i}}t tr{\`e}s simple, anytime, et permet un passage {\`a} l'{\'e}chelle naturel. Nous comparons notre impl{\'e}mentation, Anymalign, {\`a} deux t{\'e}nors statistiques du domaine sur des t{\{\\^a}}ches bilingues. Bien qu'{\`a} l'heure actuelle ses r{\'e}sultats sont en moyenne l{\'e}g{\`e}rement en retrait par rapport {\`a} l'{\'e}tat de l'art en traduction automatique probabiliste par segments, nous montrons que la qualit{\'e} propre des lexiques produits par notre m{\'e}thode est en fait sup{\'e}rieure {\`a} celle de l'{\'e}tat de l'art.",
	author = "Adrien Lardilleux",
	month = sep,
	year = "2010",
	note = "L'objectif de cette th{\`e}se est de montrer que, contrairement aux id{\'e}es re\c{c}ues, les mots de basses fr{\'e}quences peuvent {\{\\^e}}tre mis {\`a} profit de fa\c{c}on efficace en traitement automatique des langues. Nous les mettons {\`a} contribution en alignement sous-phrastique, t{\{\\^a}}che qui constitue la premi{\`e}re {\'e}tape de la plupart des syst{\`e}mes de traduction automatique fond{\'e}e sur les donn{\'e}es (traduction probabiliste ou par l'exemple). Nous montrons que les mots rares peuvent servir de fondement m{\{\\^e}}me dans la conception d'une m{\'e}thode d'alignement sous-phrastique multilingue, {\`a} l'aide de techniques diff{\'e}rentielles proches de celles utilis{\'e}es en traduction automatique par l'exemple. Cette m{\'e}thode est r{\'e}ellement multilingue, en ce sens qu'elle permet le traitement simultan{\'e} d'un nombre quelconque de langues. Elle est de surcro{\{\\^i}}t tr{\`e}s simple, anytime, et permet un passage {\`a} l'{\'e}chelle naturel. Nous comparons notre impl{\'e}mentation, Anymalign, {\`a} deux t{\'e}nors statistiques du domaine sur des t{\{\\^a}}ches bilingues. Bien qu'{\`a} l'heure actuelle ses r{\'e}sultats sont en moyenne l{\'e}g{\`e}rement en retrait par rapport {\`a} l'{\'e}tat de l'art en traduction automatique probabiliste par segments, nous montrons que la qualit{\'e} propre des lexiques produits par notre m{\'e}thode est en fait sup{\'e}rieure {\`a} celle de l'{\'e}tat de l'art.",
	keywords = "traitement automatique des langues, hapax, multilinguisme, traduction automatique, alignement, {\'e}v{\'e}nements rares"
}

@Misc{ _sentence_????,
	title = "Sentence Alignment and Word Alignment: Projects, Papers, Evaluation, etc.",
	url = "http://www.cse.unt.edu/~rada/wa/",
	howpublished = "http://www.cse.unt.edu/{\textasciitilde}rada/wa/"
}

@InProceedings{ simard_using_1993,
	address = "Toronto, Ontario, Canada",
	title = "Using cognates to align sentences in bilingual corpora",
	url = "http://portal.acm.org/citation.cfm?id=962367.962411\&coll=GUIDE\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "In a recent paper, Gale and Church describe an inexpensive method for aligning bitext, based exclusively on sentence lengths [3]. While this method produces surprisingly good results (a success rate around 96\%), even better results are required to perform such tasks as the computer-assisted revision of translations. In this paper, we examine some of the weaknesses of Gale and Church's program, and explain how just a small amount of linguistic knowledge would help to overcome these weaknesses. We discuss how cognates provide for a cheap and reasonably reliable source of linguistic knowledge. To illustrate this, we describe a modification to the program in which the criterion is cognates rather than sentence lengths. Finally, we show how better and more efficient results may be obtained by combining the two criteria length and ``cogneteness''. Our method can be generalized to accommodate other sources of linguistic knowledge, and experimentation shows that it produces better results than alignments based on length alone, at a minimal cost.",
	booktitle = "Proceedings of the 1993 conference of the Centre for Advanced Studies on Collaborative research: distributed computing - Volume 2",
	publisher = "{IBM} Press",
	author = "Michel Simard and George F. Foster and Pierre Isabelle",
	year = "1993",
	pages = "1071--1082",
	annote = "{\textless}p{\textgreater}algo de programmation dynamique d{\'e}pendant d'information lexiacal : les cognats{\textless}/p{\textgreater}"
}

@Book{ kripke_naming_1981,
	title = "Naming and necessity",
	publisher = "{Wiley-Blackwell}",
	author = "Saul A. Kripke",
	year = "1981",
	note = "{KRI} 81"
}

@InCollection{ lardilleux_hapax_2007,
	title = "Hapax Legomena: Their Contribution in Number and Efficiency to Word Alignment",
	isbn = "978-3-642-04234-8",
	shorttitle = "Hapax Legomena",
	url = "http://portal.acm.org/citation.cfm?id=1616991",
	abstract = "Current techniques in word alignment disregard words with a low frequency because they would not be useful. Against this belief, this paper shows that, in particular, the notion of hapax legomena may contribute to word alignment to a large extent. In an experiment, we show that pairs of corpus hapaxes contribute to the majority of the best word alignments. In addition, we show that the notion of sentence hapax justifies a practical and common simplification of standard alignment methods.",
	booktitle = "Human Language Technology. Challenges of the Information Society: Third Language and Technology Conference, {LTC} 2007, Poznan, Poland, October 5-7, 2007, Revised Selected Papers",
	publisher = "{Springer-Verlag}",
	author = "Adrien Lardilleux and Yves Lepage",
	year = "2007",
	keywords = "hapax, low frequency term, word alignment",
	pages = "440--450"
}

@InProceedings{ sadat_systeme_2006,
	address = "Leuven, Belgique",
	title = "Syst{\`e}me de traduction automatique statistique combinant diff{\'e}rentes ressources",
	url = "http://www.iro.umontreal.ca/~foster/papers/taln06.pdf",
	abstract = "Cet article d{\'e}crit une approche combinant diff{\'e}rents mod{\`e}les statistiques pour la traduction automatique bas{\'e}e sur les segments. Pour ce faire, diff{\'e}rentes ressources sont utilis{\'e}es, dont deux corpus parall{\`e}les aux caract{\'e}ristiques diff{\'e}rentes et un dictionnaire de terminologie bilingue et ce, afin d’am{\'e}liorer la performance quantitative et qualitative du syst{\`e}me de traduction. Nous {\'e}valuons notre approche sur la paire de langue fran\c{c}ais-anglais et montrons comment la combinaison des ressources propos{\'e}e am{\'e}liore de fa\c{c}on significative les r{\'e}sultats.",
	booktitle = "Actes de la 16{\`e}me conf{\'e}rence annuelle sur le Traitement Automatique des Langues Naturelles, 10-13 avril",
	author = "Fatiha Sadat and George Foster and Roland Kuhn",
	year = "2006",
	keywords = "corpus parall{\`e}le, dictionnaire de terminologie bilingue, Traduction automatique statistique bas{\'e}e sur les segments"
}

@InProceedings{ gyongyi_web_2005,
	title = "Web Spam Taxonomy",
	url = "http://ilpubs.stanford.edu:8090/771/",
	booktitle = "Proceedings of the First International Workshop on Adversarial Information Retrieval on the Web {(AIRWeb} 2005)",
	author = "Zoltan Gyongyi and Hector {Garcia-Molina}",
	month = apr,
	year = "2005",
	note = "{GYO} 05",
	keywords = "Databases and the Web"
}

@InProceedings{ jardino_identification_2006,
	address = "Besan\c{c}on",
	title = "Identification des auteurs de textes courts avec des n-grammes de caract{\`e}res",
	volume = "1",
	booktitle = "Actes des 8es Journ{\'e}es internationales d'analyse statistique des Donn{\'e}es Textuelles",
	author = "Mich{\`e}le Jardino",
	month = apr,
	year = "2006",
	pages = "543"
}

@InProceedings{ ide_multext:_1994,
	address = "Kyoto, Japan",
	title = "{MULTEXT:} Multilingual Text Tools and Corpora",
	shorttitle = "{MULTEXT}",
	url = "http://portal.acm.org/citation.cfm?id=991990",
	abstract = "{MULTEXT} {(Multilingual} Text Tools and Corpora) is the largest project funded in the Commission of European Communities Linguistic Research and Engineering Program. The project will contribute to the development of generally usable software tools to manipulate and analyse text corpora and to create multilingual text corpora with structural and linguistic markup. It will attempt to establish conventions for the encoding of such corpora, building on and contributing to the preliminary recommendations of the relevant international and European standardization initiatives. {MULTEXT} will also work towards establishing a set of guidelines for text software development, which will be widely published in order to enable future development by others. All tools and data developed within the project will be made freely and publicly available.",
	booktitle = "Proceedings of the 15th conference on Computational linguistics - Volume 1",
	publisher = "Association for Computational Linguistics",
	author = "Nancy Ide and Jean V{\'e}ronis",
	year = "1994",
	keywords = "corpus annotation, multi-lingual corpora, text markup, text software",
	pages = "588--592"
}

@InProceedings{ chen_aligning_1993,
	address = "Columbus, Ohio",
	title = "Aligning sentences in bilingual corpora using lexical information",
	url = "http://portal.acm.org/citation.cfm?id=981574.981576\&coll=GUIDE\&dl=GUIDE\&CFID=76577594\&CFTOKEN=73477001",
	abstract = "In this paper, we describe a fast algorithm for aligning sentences with their translations in a bilingual corpus. Existing efficient algorithms ignore word identities and only consider sentence length {(Brown} et al., 1991b; Gale and Church, 1991). Our algorithm constructs a simple statistical word-to-word translation model on the fly during alignment. We find the alignment that maximizes the probability of generating the corpus with this translation model. We have achieved an error rate of approximately 0.4\% on Canadian Hansard data, which is a significant improvement over previous results. The algorithm is language independent.",
	booktitle = "Proceedings of the 31st annual meeting on Association for Computational Linguistics",
	publisher = "Association for Computational Linguistics",
	author = "Stanley F. Chen",
	year = "1993",
	pages = "9--16",
	annote = "{\textless}p{\textgreater}la programmation dynamique est sensible au suppression survenant dans une des langues{\textless}/p{\textgreater}"
}

@InCollection{ abney_parsing_1991,
	address = "Dordrecht",
	title = "Parsing By Chunks",
	url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.11.8199",
	abstract = "Introduction I begin with an intuition: when I read a sentence, I read it a chunk at a time. For example, the previous sentence breaks up something like this: (1) {[I} begin] [with an intuition]: [when I read] [a sentence], {[I} read it] [a chunk] [at a time] These chunks correspond in some way to prosodic patterns. It appears, for instance, that the strongest stresses in the sentence fall one to a chunk, and pauses are most likely to fall between chunks. Chunks also represent a grammatical...",
	booktitle = "Robert Berwick, Steven Abney et Carol Tenny, editeurs, {Principle-Based} Parsing",
	publisher = "Kluwer Academic Publishers",
	author = "Steven P Abney",
	year = "1991",
	pages = "257--278"
}

@PhDThesis{ zimina-poirot_approches_2004,
	type = "Sciences du langage",
	title = "Approches quantitatives de l'extraction de ressources traductionnelles {\`a} partir de corpus parall{\`e}les",
	url = "http://hal.archives-ouvertes.fr/index.php?halsid=7hubfdttvo7pmuoousu7ulelg7\&view_this_doc=tel-00008311\&version=1",
	abstract = "Ce travail pr{\'e}sente les r{\'e}sultats d'une s{\'e}rie de recherches consacr{\'e}es au d{\'e}veloppement d'une nouvelle famille d'outils d'exploration textom{\'e}trique intertextuelle. De nombreuses m{\'e}thodes de statistique textuelle ont {\'e}t{\'e} articul{\'e}es et adapt{\'e}es au cadre multilingue : la m{\'e}thode des segments r{\'e}p{\'e}t{\'e}s, les sp{\'e}cificit{\'e}s, la topographie bi-textuelle, les cooccurrences multiples, l'analyse factorielle des correspondances, la classification automatique, etc. L'utilisation de chaque m{\'e}thode dans le contexte multilingue est illustr{\'e}e par des exemples d'applications concr{\`e}tes, accompagn{\'e}s d'{\'e}chantillons de ressources traductionnelles obtenues {\`a} partir du corpus parall{\`e}le fran\c{c}ais/anglais de la Convention de sauvegarde des Droits de {l'Homme.} Les perspectives ouvertes par cette approche offrent aux traducteurs, enseignants en langues {\'e}trang{\`e}res, terminologues, lexicographes, etc., des moyens automatis{\'e}s pour explorer la structure des {\'e}quivalences lexicales dans les corpus de traduction.",
	school = "Universit{\'e} Paris3 - Sorbonne Nouvelle",
	author = "Maria {Zimina-Poirot}",
	month = nov,
	year = "2004",
	note = "Ce travail pr{\'e}sente les r{\'e}sultats d'une s{\'e}rie de recherches consacr{\'e}es au d{\'e}veloppement d'une nouvelle famille d'outils d'exploration textom{\'e}trique intertextuelle. De nombreuses m{\'e}thodes de statistique textuelle ont {\'e}t{\'e} articul{\'e}es et adapt{\'e}es au cadre multilingue : la m{\'e}thode des segments r{\'e}p{\'e}t{\'e}s, les sp{\'e}cificit{\'e}s, la topographie bi-textuelle, les cooccurrences multiples, l'analyse factorielle des correspondances, la classification automatique, etc. L'utilisation de chaque m{\'e}thode dans le contexte multilingue est illustr{\'e}e par des exemples d'applications concr{\`e}tes, accompagn{\'e}s d'{\'e}chantillons de ressources traductionnelles obtenues {\`a} partir du corpus parall{\`e}le fran\c{c}ais/anglais de la Convention de sauvegarde des Droits de {l'Homme.} Les perspectives ouvertes par cette approche offrent aux traducteurs, enseignants en langues {\'e}trang{\`e}res, terminologues, lexicographes, etc., des moyens automatis{\'e}s pour explorer la structure des {\'e}quivalences lexicales dans les corpus de traduction.",
	keywords = "Alignement, bi-texte, corpus parall{\`e}les, correspondances traductionnelles, statistique textuelle, textom{\'e}trie, topographie textuelle"
}

@InProceedings{ biskri_les_2001,
	address = "Tours, France",
	title = "Les n-grams de caract{\`e}res pour l'extraction de connaissances dans des bases de donn{\'e}es textuelles multilingues",
	url = "http://www.uqtr.ca/~biskri/",
	abstract = "Une v{\'e}ritable classification num{\'e}rique multilingue est impossible si on consid{\`e}re seulement le mot comme unit{\'e} d’information privil{\'e}gi{\'e}e. En traitant les mots comme jetons, la tokenisation s’av{\`e}re simple pour le fran\c{c}ais et l’anglais, mais tr{\`e}s difficile pour des langues comme l’allemand ou l’arabe. D’autre part, la lemmatisation utilis{\'e}e comme moyen de normalisation et de r{\'e}duction du lexique constitue un {\'e}cueil non moins n{\'e}gligeable. La notion de n-grams, qui depuis une d{\'e}cennie donne de bons r{\'e}sultats dans l’identification de la langue ou dans l’analyse de l’oral, est, par les recherches r{\'e}centes, devenue un axe privil{\'e}gi{\'e} dans l’acquisition et l’extraction des connaissances dans les textes. Dans cet article, nous pr{\'e}senterons un outil de classification num{\'e}rique bas{\'e} sur le concept de n-grams de caract{\`e}res. Nous {\'e}valuons aussi les r{\'e}sultats de cet outil que nous comparons {\`a} des r{\'e}sultats obtenus au moyen d’une classification fond{\'e}e sur des mots.",
	booktitle = "Actes de la 8{\`e}me conf{\'e}rence annuelle sur le Traitement Automatique des Langues Naturelles, 2-5 juillet",
	author = "Ismaïl Biskri and Sylvain Delisle",
	month = jul,
	year = "2001",
	keywords = "extraction de connaissances, fr{\'e}quence, lemmatisation, lexique, n-grams"
}

@Book{ cacaly_dictionnaire_2008,
	edition = "3e {\'e}dition",
	title = "Dictionnaire de l'information",
	isbn = "2200351321",
	publisher = "Armand Colin",
	author = "Serge Cacaly and {Yves-Fran\c{c}ois} Le Coadic and Eric Sutter and {Paul-Dominique} Pomart",
	month = mar,
	year = "2008",
	note = "{CAC} 08"
}

@InProceedings{ pincemin_lexicometrie_2004,
	address = "{Louvain-la-Neuve,} Belgique",
	title = "Lexicom{\'e}trie et corpus multilingues",
	volume = "2",
	url = "http://icar.univ-lyon2.fr/membres/bpincemin/publications.htm",
	abstract = "Par del{\`a} l’alignement automatique de corpus, fondamental mais b{\'e}n{\'e}ficiant d{\'e}j{\`a} d’une certaine maturit{\'e} et d’une bonne notori{\'e}t{\'e}, cette table-ronde s’int{\'e}resse {\`a} l’extension des techniques et applications lexicom{\'e}triques en contexte multilingue. Sont bien {\'e}videmment concern{\'e}s les corpus parall{\`e}les, pour l’analyse et l’exploitation desquels peuvent {\{\\^e}}tre propos{\'e}s de nouveaux outils et m{\'e}thodes. Sont {\'e}galement en jeux les corpus comparables (non align{\'e}s), qu’il s’agit d’{\{\\^e}}tre capable d’exploiter moyennent l’adaptation de proc{\'e}dures d’analyse statistique jusque l{\`a} pratiqu{\'e}es sur des corpus monolingues. Sont enfin {\'e}galement consid{\'e}r{\'e}s les corpus dont la langue introduit de nouvelles questions th{\'e}oriques et techniques pour les logiciels de lexicom{\'e}trie, par exemple pour le codage des caract{\`e}res, le d{\'e}coupage en ``< mots ''>, ou le codage des informations morphosyntaxiques. Des normes internationales r{\'e}centes et en cours d’{\'e}laboration guident maintenant le codage des corpus et des ressources linguistiques de tous ordres. Prenant en compte la diversit{\'e} des langues, elles visent {\`a} favoriser la comparabilit{\'e} et la r{\'e}utilisabilit{\'e} des donn{\'e}es textuelles.",
	booktitle = "Actes des {JADT'04}",
	publisher = "Presse universitaires de Louvain",
	author = "B{\'e}n{\'e}dicte Pincemin",
	year = "2004",
	keywords = "analyse canonique non lin{\'e}aire, classification automatique, corpus comparables, corpus en langues orientales, corpus parall{\`e}les, d{\'e}coupage en mots, {\'e}tiquetage linguistique, interfaces de navigation, jeux de caract{\`e}res, lexiques multilingues, normalisation, repr{\'e}sentation topographique de textes parall{\`e}les, r{\'e}seaux de cooccurrences, segments r{\'e}p{\'e}t{\'e}s, sp{\'e}cificit{\'e}s, statistique textuelle",
	pages = "1203--1206"
}

@Book{ baeza-yates_modern_1999,
	title = "Modern Information Retrieval",
	isbn = "{020139829X}",
	url = "http://portal.acm.org/citation.cfm?id=553876\&coll=GUIDE\&dl=GUIDE\&CFID=85411802\&CFTOKEN=16272027",
	abstract = "From the {Publisher:This} is a rigorous and complete textbook for a first course on information retrieval from the computer science (as opposed to a user-centred) perspective. The advent of the Internet and the enormous increase in volume of electronically stored information generally has led to substantial work on {IR} from the computer science perspective - this book provides an up-to-date student oriented treatment of the subject.",
	publisher = "{Addison-Wesley} Longman Publishing Co., Inc.",
	author = "Ricardo A. {Baeza-Yates} and Berthier {Ribeiro-Neto}",
	year = "1999",
	note = "{BAE} 99"
}

@Book{ kleiber_problemes_1981,
	address = "Paris",
	edition = "Etudes publi{\'e}es par le Centre {d’Analyse} Syntaxique de {l’Universit{\'e}} de Metz",
	series = "Recherches Linguistiques",
	title = "Probl{\`e}mes de r{\'e}f{\'e}rence. Descriptions d{\'e}finies et noms propres",
	number = "{VI}",
	publisher = "Klincksieck",
	author = "Georges Kleiber",
	year = "1981",
	note = "{KLE} 81"
}

@Book{ neveu_dictionnaire_2004,
	title = "Dictionnaire des sciences du langage",
	isbn = "2200263783",
	publisher = "Armand Colin",
	author = "Franck Neveu",
	month = nov,
	year = "2004",
	note = "{NEU} 04"
}

@InProceedings{ ahrenberg_evaluating_2000,
	title = "Evaluating Word Alignment Systems",
	volume = "3",
	url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.49.4963",
	abstract = "Recent years have seen a few serious attempts to develop methods and measures for the evaluation of word alignment systems, notably the Blinker project {(Melamed,} 1998) and the {ARCADE} project {(V{\'e}ronis} and Langlais, forthcoming). In this paper we discuss different approaches to the problem and report on results from a project where two word alignment systems have been evaluated. These results include methods and tools for the generation of reference data and a set of measures for system performance. We note that the selection and sampling of reference data can have a great impact on scoring results.",
	booktitle = "Proceedings of the Second International Conference on Language Resources and Evaluation {(LREC)}",
	author = "Lars Ahrenberg and Magnus Merkel",
	year = "2000",
	pages = "1255--1261",
	annote = "{\textless}p{\textgreater}difficult{\'e}s d'alignement des unit{\'e}s complexes, not. en su{\'e}dois {(Blank,} 2000) et en allemand {(Ahrenberg,} 2000, Van Der Eijk, 1993, Jones, 1994){\textless}/p{\textgreater} {\textless}p{\textgreater}+ difficult{\'e}s particuli{\`e}res sur les mots grammaticaux, encore moins bi-univoques{\textless}/p{\textgreater} {\textless}p{\textgreater}techniques de s{\'e}lections d'unit{\'e}s complexes {(Lafon,} 1994 ; Church, 1990 ; Smadja, 1990){\textless}/p{\textgreater} {\textless}p{\textgreater} {\textless}/p{\textgreater}"
}

@InProceedings{ bender_linguistically_2009,
	address = "Ath{\`e}nes, Gr{\`e}ce",
	title = "Linguistically Naïve! = Language Independent: Why {NLP} Needs Linguistic Typology",
	shorttitle = "Linguistically Naïve! = Language Independent",
	url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.164.6013",
	booktitle = "{ILCL} '09, Proceedings of the {EACL} 2009 Workshop on the Interaction between Linguistics and Computational Linguistics: Virtuous, Vicious or Vacuous?, , European chapter of the Association for Computational Linguistics {EACL} 26-32.",
	author = "Emily M Bender",
	year = "2009"
}

@Misc{ _carmel_????,
	title = "{CARMEL}",
	url = "http://www.technolangue.net/imprimer.php3?id_article=142",
	howpublished = "http://www.technolangue.net/imprimer.php3?id\_article=142"
}

@Article{ church_dotplot:_1993,
	title = "Dotplot: A Program for Exploring {Self-Similarity} in Millions of Lines of Text and Code",
	volume = "2",
	issn = "10618600",
	shorttitle = "Dotplot",
	url = "http://www.jstor.org/stable/1390697",
	doi = "10.2307/1390697",
	abstract = "{{\textless}p{\textgreater}An} interactive program, dotplot, has been developed for browsing millions of lines of text and source code, using an approach borrowed from biology for studying homology (self-similarity) in {DNA} sequences. With conventional browsing tools such as a screen editor, it is difficult to identify structures that are too big to fit on the screen. In contrast, with dotplots we find that many of these structures show up as diagonals, squares, textures, and other visually recognizable features, as will be illustrated in examples selected from biology and two new application domains, text {(AP} news, Canadian Hansards) and source code {(5ESS{\textregistered}).} In an attempt to isolate the mechanisms that produce these features, we have synthesized similar features in dotplots of artificial sequences. We also introduce an approximation that makes the calculation of dotplots practical for use in an interactive browser.{\textless}/p{\textgreater}",
	number = "2",
	journal = "Journal of Computational and Graphical Statistics",
	author = "Kenneth Ward Church and Jonathan Isaac Helfman",
	month = jun,
	year = "1993",
	note = "{ArticleType:} research-article / Full publication date: Jun., 1993 / Copyright {\textcopyright} 1993 American Statistical Association, Institute of Mathematical Statistics and Interface Foundation of America",
	pages = "153--174"
}
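
Annotation: a dotplot marks coordinate (i, j) whenever unit i of one sequence matches unit j of another (or the same) sequence; repeated or translated regions then show up as diagonals. A tiny sketch in Python over arbitrary hashable units (for source code, lines are a natural choice); this exact-match representation is an assumption, not the paper's approximation scheme.

    def dotplot(units_a, units_b):
        """Set of (i, j) points where unit i of the first sequence
        equals unit j of the second; plotting them reveals structure
        (diagonals, squares, textures) as described in the paper."""
        index = {}
        for j, u in enumerate(units_b):
            index.setdefault(u, []).append(j)
        return {(i, j) for i, u in enumerate(units_a) for j in index.get(u, [])}

    # Self-similarity of a file, line by line:
    # lines = open("file.txt").read().splitlines()
    # points = dotplot(lines, lines)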

@InProceedings{ dagan_robust_1993,
	title = "Robust Bilingual Word Alignment for Machine Aided Translation",
	url = "http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.4941",
	booktitle = "Proceedings of the Workshop on Very Large Corpora",
	author = "Ido Dagan and Kenneth W Church and William A Gale",
	year = "1993",
	pages = "1--8"
}

